def _pollMbox(self):
    """Drain the configured mbox file and hand its bugmails to the handler.

    Reads the path from the ``mbox`` registry value and returns immediately
    when it is unset.  Otherwise the file is opened read/write, locked,
    parsed message by message, truncated to zero length, unlocked and
    closed.  The collected ``bugmail.Bugmail`` objects are then passed to
    ``self._handleBugmails``.

    Messages that are not bugmail are skipped silently; any other parsing
    error is logged and the message is skipped.
    """
    file_name = self.registryValue('mbox')
    if not file_name:
        return

    bugmails = []
    boxFile = open(file_name, 'r+b')
    try:
        # Lock inside the outer try so the handle is closed even when
        # locking itself fails.
        _lock_file(boxFile)
        try:
            self.log.debug('Polling mbox %r' % boxFile)
            box = mailbox.PortableUnixMailbox(boxFile, _message_factory)
            for message in box:
                if message == '':
                    continue
                self.log.debug('Parsing message %s' % message['Message-ID'])
                try:
                    bugmails.append(bugmail.Bugmail(message))
                except bugmail.NotBugmailException:
                    continue
                except Exception:
                    # Best effort: log and move on, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    self.log.exception('Exception while parsing message:')
                    self.log.debug("Message:\n%s" % message.as_string())
            # Everything has been read; empty the mbox while still locked.
            boxFile.truncate(0)
        finally:
            _unlock_file(boxFile)
    finally:
        boxFile.close()

    self._handleBugmails(bugmails)
def process_mbox(self):
    """Open the mbox at ``self.path`` and index its messages in ``self.cache``.

    For every message whose subject passes ``self.run_rules`` a tuple of
    (index, start offset, length, subject, date, sender, message id,
    in-reply-to, to, cc) is stored under the id returned by
    ``self.get_unique_id``.  The earliest and latest ``Date`` values seen
    are stored in ``self.starting`` and ``self.ending``.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line; confirm that the date-range tracking belongs inside the
    # rule-match branch.
    self.cache = {}
    mb = mailbox.PortableUnixMailbox(open(self.path, 'rb'))
    starting = ending = None

    # The mailbox's next() returns None once the file is exhausted.
    for msg in iter(mb.next, None):
        document = msg.fp.read()
        if document is None:
            continue

        m = mbox_email(''.join(msg.headers))
        d = msg.getdate('Date')
        s = m.getSubject()
        (result, _reason) = self.run_rules(s)
        if not result:
            continue

        if s == '':
            s = '(no subject)'
        from_addr = m.getFrom()
        f = from_addr[0] or from_addr[1]
        index = self.get_unique_id(m)
        start, stop = get_start_stop(msg)
        self.cache[index] = (index, start, stop - start, s, d, f,
                             m.getMessageID(), m.getInReplyTo(),
                             m.getTo(), m.getCC())

        # Widen the observed date range.
        if starting is None or d < starting:
            starting = d
        if ending is None or d > ending:
            ending = d

    self.starting, self.ending = starting, ending
    mb = None
def filedeconstructor(self, fn):
    '''Return the searchable text of file fn.

    A file containing a NUL byte is treated as binary and yields an empty
    string.  When self.ui.text is set, or the message factory produces
    nothing, the raw file contents are returned.  Otherwise the file is
    handled as a single message or, if it carries a unix "From " line, as
    a mailbox, and the strings returned by self.msgharvest are joined
    with newlines.

    Raises util.DeadMan when the parsed message has no message-id.'''
    handle = open(fn, 'rb')
    try:
        text = handle.read()
        # binary check from mercurial.util
        if '\0' in text:
            return ''
        if self.ui.text:
            return text

        msg = _msgfactory(handle)
        if not msg:
            return text

        # from here on it's a message or a mailbox
        if not msg['message-id']:
            hint = ('make sure input is a raw message'
                    ' - in mutt: unset pipe_decode -,'
                    ' or use -t/--text to disable message detection')
            raise util.DeadMan('no message-id found', hint=hint)

        if msg.get_unixfrom():
            # treat text like a mailbox because it might be one
            textlist = []  # list of strings to search
            mbox = mailbox.PortableUnixMailbox(handle, _msgfactory)
            for msg in iter(mbox.next, None):
                if msg:
                    textlist += self.msgharvest(msg)
        else:
            textlist = self.msgharvest(msg)
    finally:
        handle.close()
    return '\n'.join(textlist)
def scan_file(filename, compress, overwrite, nospinner):
    """Gets IDs of messages in the specified mbox file.

    Returns an empty list when the file is going to be overwritten or does
    not exist; otherwise returns a dict mapping each parsed Message-Id to
    itself.  ``compress`` selects the opener ('gzip', 'bzip2' or anything
    else for a plain file); ``nospinner`` is passed through to ``Spinner``.
    A warning is printed for every message whose Message-Id is missing or
    cannot be parsed.
    """
    if overwrite:
        # file will be overwritten
        return []
    assert 'bzip2' != compress

    # file doesn't exist
    if not os.path.exists(filename):
        print("File %s: not found" % filename)
        return []

    spinner = Spinner("File %s" % filename, nospinner)

    # open the file
    if compress == 'gzip':
        mbox = gzip.GzipFile(filename, 'rb')
    elif compress == 'bzip2':
        mbox = bz2.BZ2File(filename, 'rb')
    else:
        mbox = open(filename, 'rb')

    messages = {}
    try:
        # each message
        for i, message in enumerate(mailbox.PortableUnixMailbox(mbox)):
            header = ''
            # We assume all messages on disk have message-ids
            try:
                header = ''.join(message.getfirstmatchingheader('message-id'))
            except KeyError:
                # No message ID was found. Warn the user and move on
                print("")
                print("WARNING: Message #%d in %s has no Message-Id header."
                      % (i, filename))

            header = BLANKS_RE.sub(' ', header.strip())
            try:
                msg_id = MSGID_RE.match(header).group(1)
                # Direct dict membership test: avoids building a key list
                # per message, and assigning the same key twice is harmless.
                if msg_id not in messages:
                    messages[msg_id] = msg_id
            except AttributeError:
                # Message-Id was found but could somehow not be parsed by
                # regexp (highly bloody unlikely)
                print("")
                print("WARNING: Message #%d in %s has a malformed "
                      "Message-Id header." % (i, filename))
            spinner.spin()
    finally:
        # done (also reached when parsing raises)
        mbox.close()

    spinner.stop()
    print(": %d messages" % len(messages))
    return messages
def mbox_train(h, path, is_spam, force):
    """Train bayes with a Unix mbox.

    Every message in the mbox at ``path`` is passed to ``msg_train`` with
    ``h``, ``is_spam`` and ``force``.  When the ``Headers/include_trained``
    option is set, each message is also written to a temporary file and
    the mbox is rewritten from that copy afterwards.  The mbox is held
    under an exclusive ``flock`` for the whole operation.

    Returns None.  If a malformed (false) message is encountered, a
    message is printed and training stops without rewriting the mbox.
    """
    if loud:
        print(" Reading as Unix mbox")

    import mailbox
    import fcntl

    # Open and lock the mailbox.  Some systems require it be opened for
    # writes in order to assert an exclusive lock.
    f = open(path, "r+b")
    outf = None
    counter = 0
    trained = 0
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        try:
            mbox = mailbox.PortableUnixMailbox(f, get_message)
            outf = os.tmpfile()

            for msg in mbox:
                if not msg:
                    print("Malformed message number %d. I can't train on "
                          "this mbox, sorry." % counter)
                    # The finally clauses below release the lock and files.
                    return
                counter += 1
                if loud and counter % 10 == 0:
                    sys.stdout.write("\r%6d" % counter)
                    sys.stdout.flush()
                if msg_train(h, msg, is_spam, force):
                    trained += 1
                if options["Headers", "include_trained"]:
                    # Write it out with the Unix "From " line
                    outf.write(mboxutils.as_string(msg, True))

            if options["Headers", "include_trained"]:
                outf.seek(0)
                try:
                    os.ftruncate(f.fileno(), 0)
                    f.seek(0)
                except:
                    # If anything goes wrong, don't try to write
                    print("Problem truncating mbox--nothing written")
                    raise
                try:
                    for line in outf:
                        f.write(line)
                except:
                    # Write to stderr directly; the stream must not be
                    # called like a function.
                    sys.stderr.write("Problem writing mbox! Sorry, "
                                     "I tried my best, but your mail "
                                     "may be corrupted.\n")
                    raise
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)
    finally:
        if outf is not None:
            outf.close()
        f.close()

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))
def main(mailbox_path):
    """Print every distinct ``To`` address in the mbox, one per line, sorted.

    ``mailbox_path`` is the path of the mbox file to read.  The address is
    the second element of ``msg.getaddr('To')``.
    """
    addresses = set()
    fp = open(mailbox_path)
    try:
        for msg in mailbox.PortableUnixMailbox(fp):
            addresses.add(msg.getaddr('To')[1])
    finally:
        # Close the mailbox file even if a message fails to parse.
        fp.close()

    for address in sorted(addresses):
        print(address)
def test_unix_mbox(self):
    """A one-message file parses as a single mbox entry with intact content."""
    ### should be better!
    import email.Parser
    fname = self.createMessage("cur", True)
    box_file = open(fname)
    parse = email.Parser.Parser().parse
    seen = 0
    for msg in mailbox.PortableUnixMailbox(box_file, parse):
        seen += 1
        self.assertEqual(msg["subject"], "Simple Test")
        expected_len = len(FROM_) + len(DUMMY_MESSAGE)
        self.assertEqual(len(str(msg)), expected_len)
    self.assertEqual(seen, 1)
def count(fname):
    """Count messages in the mbox ``fname`` by presence of recipients.

    Returns ``(goodcount, badcount)``: a message is "bad" when both its
    ``to`` and ``cc`` headers are None, and "good" otherwise.
    """
    goodcount = 0
    badcount = 0
    fp = open(fname, 'rb')
    try:
        for msg in mailbox.PortableUnixMailbox(fp, get_message):
            if msg["to"] is None and msg["cc"] is None:
                badcount += 1
            else:
                goodcount += 1
    finally:
        # Release the file even if parsing a message raises.
        fp.close()
    return goodcount, badcount
def main(filename, attachment_dir): emails = load_results() num = 0 with open(filename, 'rb') as fp: mb = mailbox.PortableUnixMailbox(fp, factory=email.message_from_file) if not os.path.exists(attachment_dir): os.mkdir(attachment_dir) for message in mb: num += 1 if num - 1 in emails: if num % 1000 == 0: print "skipping", num continue attachments = return_attachments(message) # We only want to store those emails that have attachments if attachments: saved_attachments = [] for a in attachments: content_hash = hashlib.sha256(a['content']).hexdigest() new_location = attachment_dir + '/' + content_hash if not os.path.exists(new_location): with open(new_location, 'wb') as f: f.write(a['content']) saved_attachments.append({ 'filename': convert_to_unicode(a['filename']), 'hash': content_hash }) if message['date']: date = convert_to_unicode(message['date']) else: date = None subject = convert_to_unicode(message['subject']) froms = map(convert_to_unicode, message.get_all('from', 'ignore')) tos = map(convert_to_unicode, message.get_all('to', 'ignore')) emails[num - 1] = { 'attachments': saved_attachments, 'date': date, 'subject': subject, 'froms': froms, 'tos': tos } if num % 1000 == 0: print num save_results(emails) print num
def getFileMessageIds(self, filename):
    """Gets IDs of messages in the specified mbox file.

    The stream is obtained from ``self.output.getStream`` and its first
    underlying file is parsed as an mbox.  Returns an empty list when the
    stream raises IOError or has no usable file; otherwise returns a dict
    mapping each parsed Message-Id to itself.  Messages whose Message-Id
    is missing or unparsable are logged as warnings and skipped.
    """
    try:
        mbox = self.output.getStream(filename, mode='r')
        mbox = mbox.files[0]
        if not mbox:
            # no valid file exists
            return []
    except IOError:
        # file does not exist
        return []

    messages = {}
    try:
        # each message
        for i, message in enumerate(mailbox.PortableUnixMailbox(mbox)):
            header = ''
            # We assume all messages on disk have message-ids
            try:
                header = ''.join(message.getfirstmatchingheader('message-id'))
            except KeyError:
                # No message ID was found. Warn the user and move on
                self.logger.warn(
                    "Message {id} in {file} has no Message-Id header".format(
                        id=i, file=filename))

            header = self.BLANKS_RE.sub(' ', header.strip())
            try:
                msg_id = self.MSGID_RE.match(header).group(1)
                # Direct dict membership test instead of scanning a key list.
                if msg_id not in messages:
                    messages[msg_id] = msg_id
            except AttributeError:
                # Message-Id was found but could somehow not be parsed by
                # regexp (highly bloody unlikely)
                self.logger.warn(
                    'Message {id} in {file} has a malformed Message-Id header.'
                    .format(id=i, file=filename))
    finally:
        # Close the stream even when parsing fails part-way.
        mbox.close()
    return messages
def boxparser(self, path, maildir=False, isspool=False):
    """Open the mail store at ``path`` as a Maildir or an mbox.

    Returns without doing anything when ``path`` is the mail spool and
    ``isspool`` is false, when ``self.ui.mask`` matches ``path``, when a
    maildir is unreadable or lacks any of ``cur``/``new``/``tmp``, or when
    an mbox file cannot be opened (the error is reported through
    ``self.ui.warn``).
    """
    # NOTE(review): the visible body ends right after ``mbox`` is created;
    # any further processing is not shown here.
    if not isspool and path == self.mspool:
        return
    if self.ui.mask and self.ui.mask.search(path) is not None:
        return

    if not maildir:
        try:
            fp = open(path, 'rb')
        except IOError as inst:
            self.ui.warn('%s\n' % inst)
            return
        mbox = mailbox.PortableUnixMailbox(fp, _msgfactory)
    else:
        try:
            entries = os.listdir(path)
        except OSError:
            return
        # A maildir must contain all three standard subdirectories.
        for required in ('cur', 'new', 'tmp'):
            if required not in entries:
                return
        mbox = mailbox.Maildir(path, _msgfactory)
def load_from_file():
    """Import the mbox named by the ``infile`` option in batches.

    When the ``init`` option is set the index is deleted and recreated
    first.  The first ``skip`` messages are passed over; every other
    message is converted with ``convert_msg_to_json`` and, when the result
    is truthy, queued.  Queued items are sent through ``upload_batch``
    whenever ``batch_size`` items have accumulated, and once more at the
    end for any remainder.
    """
    opts = tornado.options.options

    if opts.init:
        delete_index()
        create_index()

    if opts.skip:
        logging.info("Skipping first %d messages from mbox file" % opts.skip)

    count = 0
    upload_data = list()
    logging.info("Starting import from file %s" % opts.infile)

    infile = open(opts.infile, 'rb')
    try:
        mbox = mailbox.PortableUnixMailbox(infile, email.message_from_file)

        # NOTE(review): this parser is not used below; it is kept in case
        # constructing it matters to the rest of the module.
        emailParser = DelegatingEmailParser(
            [AmazonEmailParser(), SteamEmailParser()])

        for msg in mbox:
            count += 1
            if not count % 100:
                logging.info("Item %d" % count)
            # ``count`` is 1-based, so ``<=`` skips exactly ``skip``
            # messages, matching the log line above.
            if count <= opts.skip:
                continue
            item = convert_msg_to_json(msg)
            if item:
                upload_data.append(item)
                if len(upload_data) == opts.batch_size:
                    upload_batch(upload_data)
                    upload_data = list()
    finally:
        # Close the input even if conversion or upload raises.
        infile.close()

    # upload remaining items in `upload_data`
    if upload_data:
        upload_batch(upload_data)

    logging.info("Import done - total count %d" % count)
def _selectBox(self):
    """Open the mailbox chosen in the UI and store it in ``self.mb``.

    The selected type (``self.boxtype``) is compared against the entries
    of ``self.boxtyps`` in order; the matching ``mailbox`` class is
    created for the location in ``self.mailbox`` and a confirmation is
    shown through ``self.Disp``.  An unrecognized type is reported
    instead.
    """
    # NOTE(review): nesting reconstructed from a flattened source line;
    # ``self.running = 2`` is assumed to belong to the unknown-type branch.

    def announce(location):
        # Same success message for every mailbox flavour.
        self.Disp(self.boxtype.get(), " at location ", location,
                  " Opened Successfully.")

    if self.boxtype.get() == self.boxtyps[0]:
        # mBox Strict
        self.mb = mailbox.UnixMailbox(file(self.mailbox.get(), 'r'))
        announce(self.mailbox.get())
    elif self.boxtype.get() == self.boxtyps[1]:
        # mBox Loose
        self.mb = mailbox.PortableUnixMailbox(file(self.mailbox.get(), 'r'))
        announce(self.mailbox.get())
    elif self.boxtype.get() == self.boxtyps[2]:
        # MailDir: opened on the directory that contains the chosen path
        self.mb = mailbox.Maildir(os.path.dirname(self.mailbox.get()))
        announce(os.path.dirname(self.mailbox.get()))
    elif self.boxtype.get() == self.boxtyps[3]:
        # MMDF
        self.mb = mailbox.MmdfMailbox(file(self.mailbox.get(), 'r'))
        announce(self.mailbox.get())
    elif self.boxtype.get() == self.boxtyps[4]:
        # MH
        self.mb = mailbox.MHMailbox(file(self.mailbox.get(), 'r'))
        announce(self.mailbox.get())
    elif self.boxtype.get() == self.boxtyps[5]:
        # Babyl
        self.mb = mailbox.BabylMailbox(file(self.mailbox.get(), 'r'))
        announce(self.mailbox.get())
    else:
        # Unknown File Type
        self.Disp("*** I don't know about that file type.")
        self.running = 2
def process_mailbox(f, dosa=1, pats=None):
    """Process every message of the mbox file object ``f`` and emit it.

    Each message is parsed with ``Parser().parse``, passed to
    ``process_message`` together with ``dosa`` and ``pats``, and then
    flattened to standard output with its unix "From " line and without
    header wrapping.
    """
    writer = email.Generator.Generator(sys.stdout, maxheaderlen=0)
    parse = Parser().parse
    for message in mailbox.PortableUnixMailbox(f, parse):
        process_message(message, dosa, pats)
        writer.flatten(message, unixfrom=1)
continue if save_lines: stripped_line = line.strip() if not stripped_line: break full_error_lines.append(stripped_line) return full_error_lines for e in os.listdir(mail_directory): if not re.search(r'^\d+$', e): continue filename = os.path.join(mail_directory, e) with open(filename, 'rb') as fp: # A hack to parse Gnus's nnml format mail folders: raw_email = re.sub(r'^X-From-Line: ', 'From ', fp.read()) for mail in mailbox.PortableUnixMailbox(StringIO.StringIO(raw_email)): dt = dateutil.parser.parse(mail['date']) if dt < since: continue if dt > until: continue if 'subject' not in mail: continue subject = mail['subject'] if not re.search(r'^Mail delivery failed', subject): continue failed_recipient = mail.getheader('X-Failed-Recipients').strip() bounced.add(failed_recipient) print failed_recipient print >> sys.stderr, "\n".join(" " + l for l in get_errors(mail))
#!/usr/bin/python
# List the Subject header of every message in an mbox file.
import email
import mailbox
import sys

if len(sys.argv) < 2:
    print("Usage: %s [path to mailbox file]" % sys.argv[0])
    # Exit with a plain integer status.
    sys.exit(1)

path = sys.argv[1]
subjects = []

fp = open(path, 'rb')
try:
    for message in mailbox.PortableUnixMailbox(fp, email.message_from_file):
        subjects.append(message['Subject'])
finally:
    fp.close()

# Two placeholders for two values: the count and the mailbox path.
print('%s message(s) in mailbox "%s":' % (len(subjects), path))
for subject in subjects:
    # One string argument so Python 2 does not print a tuple.
    print(' %s' % subject)
def get_mailbox(self, filename):
    """Return a PortableUnixMailbox for ``filename`` inside ``self.mbox_dir``."""
    mbox_path = os.path.join(self.mbox_dir, filename)
    mbox_file = open(mbox_path)
    return mailbox.PortableUnixMailbox(mbox_file)
#!/usr/bin/python
# Print every message of an mbox after removing, from multipart messages,
# the top-level parts whose content type starts with the given prefix.
import email
import mailbox
import os
import sys

if len(sys.argv) < 3:
    print("%s <{mailbox}> <mime-type>" % sys.argv[0])
    sys.exit(-1)

mime_prefix = sys.argv[2]

fp = open(sys.argv[1], 'rb')
try:
    mbox = mailbox.PortableUnixMailbox(fp, email.message_from_file)
    # NOTE(review): nesting reconstructed from a flattened source line;
    # every message is assumed to be printed, multipart or not.
    for message in mbox:
        if message.is_multipart():
            parts = message.get_payload()
            # Rebuild the payload list in place rather than removing items
            # while iterating, which skipped the part after each removal.
            parts[:] = [part for part in parts
                        if not part.get_content_type().startswith(mime_prefix)]
        print(message)
finally:
    fp.close()
# Directory scanned for Eudora ``.mbx`` mailbox files.
eudoraDir = r"t:\data\luc\eudora"


def getAttrOrNone(msg, name):
    """Return ``msg[name]`` when ``name`` is present in ``msg``, else None."""
    if name in msg:
        return msg[name]
    return None


if __name__ == "__main__":
    for fn in os.listdir(eudoraDir):
        (root, ext) = os.path.splitext(fn)
        if ext != '.mbx':
            continue

        pfn = os.path.join(eudoraDir, fn)
        print("\nfound mailbox %s\n" % pfn)

        count = 0
        f = open(pfn)
        try:
            # The factory must be a callable that takes the message file;
            # ``email.Message`` is a module and cannot be called.
            mb = mailbox.PortableUnixMailbox(f, email.message_from_file)
            while True:
                msg = mb.next()
                if msg is None:
                    break
                print("%s %s %s %s" % (getAttrOrNone(msg, 'date'),
                                       getAttrOrNone(msg, 'from'),
                                       getAttrOrNone(msg, 'to'),
                                       getAttrOrNone(msg, 'subject')))
                count += 1
        finally:
            f.close()

        print("\n%s contains %d messages\n" % (fn, count))
if not (0 < percent < 100): raise ValueError percent /= 100.0 bin1 = args[2] bin2 = args[3] except IndexError: usage(1, 'Not enough arguments') except ValueError: usage(1, 'Percent argument must be a float between 1.0 and 99.0') # Cruise bin1out = open(bin1, 'wb') bin2out = open(bin2, 'wb') infp = open(mboxfile, 'rb') mbox = mailbox.PortableUnixMailbox(infp, mboxutils.get_message) for msg in mbox: if random.random() < percent: outfp = bin1out else: outfp = bin2out astext = str(msg) assert astext.endswith('\n') outfp.write(astext) outfp.close() bin1out.close() bin2out.close() if __name__ == '__main__':
def getmbox(name):
    """Return an mbox iterator given a file/directory/folder name.

    Recognized forms of ``name``:

    * ``-``: a single message read from standard input (returned as a
      one-element list).
    * ``+folder``, ``+f1,f2`` or ``+ALL``: MH folders.
    * ``:user:pwd@server[:port]:folders`` where folders is a
      comma-separated list or ``ALL``: IMAP folders.
    * a directory: a Maildir when it has a ``cur`` entry, an MH mailbox
      when the path contains ``/Mail/``, else a directory of text files.
    * anything else: a Unix mbox file.
    """
    if name == "-":
        return [get_message(sys.stdin)]

    if name.startswith("+"):
        # MH folder name: +folder, +f1,f2,f2, or +ALL
        name = name[1:]
        import mhlib
        mh = mhlib.MH()
        if name == "ALL":
            names = mh.listfolders()
        elif ',' in name:
            names = name.split(',')
        else:
            names = [name]
        mboxes = []
        mhpath = mh.getpath()
        for folder in names:
            filename = os.path.join(mhpath, folder)
            mboxes.append(mailbox.MHMailbox(filename, get_message))
        if len(mboxes) == 1:
            return iter(mboxes[0])
        return _cat(mboxes)

    elif name.startswith(":"):
        # IMAP mailbox name:
        #   :username:password@server:folder1,...folderN
        #   :username:password@server:port:folder1,...folderN
        #   :username:password@server:ALL
        #   :username:password@server:port:ALL
        parts = re.compile(
            ':(?P<user>[^@:]+):(?P<pwd>[^@]+)@(?P<server>[^:]+(:[0-9]+)?):(?P<name>[^:]+)'
        ).match(name).groupdict()

        from scripts.sb_imapfilter import IMAPSession, IMAPFolder
        from spambayes import Stats, message
        from spambayes.Options import options

        session = IMAPSession(parts['server'])
        session.login(parts['user'], parts['pwd'])
        folder_list = session.folder_list()
        # Compare the parsed folder part: ``name`` itself still begins
        # with ":" here and can never equal "ALL".
        if parts['name'] == "ALL":
            names = folder_list
        else:
            names = parts['name'].split(',')
        message_db = message.Message().message_info_db
        stats = Stats.Stats(options, message_db)
        mboxes = [IMAPFolder(n, session, stats) for n in names]
        if len(mboxes) == 1:
            return full_messages(mboxes[0])
        return _cat([full_messages(x) for x in mboxes])

    if os.path.isdir(name):
        # XXX Bogus: use a Maildir if /cur is a subdirectory, else a
        # MHMailbox if the pathname contains /Mail/, else a
        # DirOfTxtFileMailbox.
        if os.path.exists(os.path.join(name, 'cur')):
            mbox = mailbox.Maildir(name, get_message)
        elif name.find("/Mail/") >= 0:
            mbox = mailbox.MHMailbox(name, get_message)
        else:
            mbox = DirOfTxtFileMailbox(name, get_message)
    else:
        fp = open(name, "rb")
        mbox = mailbox.PortableUnixMailbox(fp, get_message)
    return iter(mbox)
def get_mailbox(self, filename):
    """Return a PortableUnixMailbox for the file ``self.mbox_dir + filename``.

    The directory and file name are joined by plain concatenation, so
    ``self.mbox_dir`` has to supply any separator itself.
    """
    mbox_path = self.mbox_dir + filename
    mbox_file = open(mbox_path)
    return mailbox.PortableUnixMailbox(mbox_file)