def index(self, msgs: notmuch.Messages) -> None:
    """Index messages in the address book"""
    # Running totals for progress / final logging.
    tot_msg = 0
    tot_addr = 0
    try:
        parser = email.parser.Parser()
        for msg in msgs:
            msg_fn = msg.get_filename()
            try:
                with open(msg_fn, "r") as data:
                    # True -> headersonly: bodies are not needed for
                    # harvesting addresses.
                    mail = parser.parse(data, True)
            except UnicodeDecodeError:
                # Retry with latin9 for messages that are not valid in the
                # default encoding.
                with open(msg_fn, "r", encoding="latin9") as data:
                    mail = parser.parse(data, True)
            addrs = []
            for hdr in ("from", "to", "cc", "bcc"):
                addrs += mail.get_all(hdr, [])
            addrs = email.utils.getaddresses(addrs)
            tot_addr += self._add(addrs)
            tot_msg += 1
            if (tot_msg % 20) == 0:
                logging.debug("Messages: %d; addresses: %d", tot_msg, tot_addr)
    finally:
        # At the end, save the DB — even when indexing was interrupted,
        # so partial progress is not lost.
        self._merge_db()
        with open(os.path.expanduser(_DBPATH), "wb") as fout:
            pickle.dump(self._db, fout, pickle.HIGHEST_PROTOCOL)
        logging.info(
            "Total: indexed %d messages and %d addresses. "
            "%d unique addresses in the address book.",
            tot_msg,
            tot_addr,
            len(self._db),
        )
def import_mail(conn, fp):
    """Read a raw mail from *fp* and attach it to the matching ISP report.

    The local part of the To: address keys the isp_reports lookup; mails
    addressed to an unknown mailname are silently dropped.
    """
    # Buffer the whole message so it can be both parsed and stored verbatim.
    msgtxt = StringIO.StringIO()
    shutil.copyfileobj(fp, msgtxt)
    msgtxt.seek(0)
    parser = email.parser.Parser()
    msg = parser.parse(msgtxt)
    to = msg['To']
    name, addr = email.utils.parseaddr(to)
    print name, addr
    # Local part of the address, lower-cased, is the report key.
    mailname = addr.split('@')[0].lower()
    print mailname
    c = conn.cursor()
    # Parameterised queries: the driver handles quoting.
    c.execute("select id from isp_reports where mailname = %s", [mailname])
    row = c.fetchone()
    if row is None:
        return
    report_id = row[0]
    c.execute("insert into isp_report_emails(report_id, message, created) values (%s,%s,now())",
              [report_id, msgtxt.getvalue()])
    conn.commit()
def add_email_from(self, lines):
    """Add an address from From: field of a mail.

    This assumes a single mail file is supplied through.

    Args:
      lines: A generator of lines, usually a open file.
    """
    parser = email.parser.HeaderParser()
    headers = parser.parse(lines)
    if 'From' not in headers:
        print "Not a valid mail file!"
        sys.exit(2)
    (name, mailaddr) = email.utils.parseaddr(headers['From'])
    if not name:
        # No display name: fall back to the bare address.
        name = mailaddr
    else:
        # This decodes headers like "=?iso-8859-1?q?p=F6stal?="
        values = email.header.decode_header(name)
        if len(values) == 0:
            # Can't this be possible?
            name = mailaddr
        else:
            # There should be only one element anyway
            (name, encoding) = values[0]
            if encoding is not None:
                name = name.decode(encoding)
    self.add_mail_contact(name, mailaddr)
def parser_worker(fileQ, loaderQ, debug):
    """Queue worker: parse mail files taken from fileQ and feed the
    (message, relative-path) pairs to loaderQ.

    A None file name is the shutdown sentinel.  Files whose first header
    value is VCARD or VCALENDAR are skipped.
    """
    try:
        while True:
            fileName = fileQ.get()
            if debug and fileName is not None:
                print "PARSING: " + fileName
            if fileName is None:
                # Sentinel: acknowledge it and stop this worker.
                fileQ.task_done()
                break
            # Strip the root directory (sys.argv[1]) plus a trailing
            # separator so only the relative path is forwarded.
            rootLen = len(sys.argv[1])
            if fileName[rootLen] == '/' or fileName[rootLen] == '\\':
                rootLen += 1
            parser = email.parser.Parser()
            with open(fileName) as f:
                eFile = fileName[rootLen:]
                email_msg = parser.parse(f, headersonly=True)
                if len(email_msg._headers) > 0:
                    # NOTE(review): peeks at the private _headers list; the
                    # first header value flags vCard/vCalendar payloads.
                    t = email_msg._headers[0]
                    if t[1] == 'VCARD':
                        "vcard - skip"
                    elif t[1] == 'VCALENDAR':
                        "vcalendar - skip"
                    else:
                        loaderQ.put( (email_msg, eFile) )
            fileQ.task_done()
    except Exception, e:
        print e
        pass
def parseElement(filename, element, type):
    """Print one element of the mail stored in *filename*.

    Args:
        filename: path of the RFC 2822 mail file to read.
        element: "message" to print the body (preferring a non-attachment
            text/html part of a multipart mail), otherwise a header name.
        type: unused; kept so existing callers keep working.
    """
    parser = email.parser.Parser()
    # Close the file deterministically (the original leaked the handle).
    with open(filename, "r") as fp:
        email_val = parser.parse(fp)
    element_val = None
    if element.lower() == "message":
        if email_val.is_multipart():
            for part in email_val.walk():
                ctype = part.get_content_type()
                cdispo = str(part.get('Content-Disposition'))
                # skip any text/plain (txt) attachments
                if ctype == 'text/html' and 'attachment' not in cdispo:
                    element_val = part.get_payload(decode=True)  # decode
                    break
        # not multipart -i.e. plain text, no attachments, keeping fingers crossed
        else:
            element_val = email_val.get_payload(decode=True)
    else:
        element_val = email_val.get_all(element)[0]
    if element_val is not None:
        # get_payload(decode=True) yields bytes while header values are
        # str; the original unconditionally called .decode() and crashed
        # on str (and "".decode() is an AttributeError on Python 3).
        if isinstance(element_val, bytes):
            element_val = element_val.decode()
        print(element_val)
    else:
        print("")
def metadata(self):
    """
    Return the contents of the :file:`METADATA` file inside the wheel.
    """
    # Parsed lazily and cached in self._metadata.
    if self._metadata is None:
        with zipfile.ZipFile(self.open()) as wheel:
            # The dist-info directory may use either the exact package tag
            # or its canonicalized form; try both spellings.
            filenames = {
                '{self.package_tag}-'
                '{self.package_version_tag}.dist-info/'
                'METADATA'.format(self=self),
                '{self.package_canon}-'
                '{self.package_version_tag}.dist-info/'
                'METADATA'.format(self=self),
            }
            for filename in filenames:
                try:
                    with wheel.open(filename) as metadata:
                        # METADATA is an RFC 822 style file; parse as bytes.
                        parser = email.parser.BytesParser()
                        self._metadata = parser.parse(metadata)
                except KeyError:
                    # ZipFile.open raises KeyError for a missing member;
                    # fall through to the next candidate name.
                    pass
                else:
                    break
            if self._metadata is None:
                raise RuntimeError(
                    'Unable to locate METADATA in %s; attempted: %r; '
                    'possible files: %r' % (self.wheel_file, filenames, {
                        info.filename
                        for info in wheel.infolist()
                        if info.filename.endswith('METADATA')
                    }))
    return self._metadata
def extract_xml(source: Union[str, TextIO]) -> io.StringIO:
    """Takes an SMTP message with a single attachment, extracts it, and
    returns it as a file-like object.

    Handles multipart mime messages (yahoo, others...) as well as
    Google's minimalist application/zip messages.

    Args:
        source: path of a message file, or an open text stream (the
            stream is closed after parsing, as before).

    Returns:
        io.StringIO holding the decoded XML report.

    Raises:
        RuntimeError: if the message does not carry exactly one
            attachment / zip member.
    """
    if not hasattr(source, 'read'):
        source = open(source, 'r')
    parser = email.parser.Parser(policy=email.policy.default)
    try:
        email_msg = parser.parse(source)
    finally:
        # Close even when parsing raises so the handle never leaks
        # (the original only closed on success).
        source.close()
    if email_msg.get_content_type() == 'application/zip':
        # google
        zip_data = email_msg.get_content()
        zf = zipfile.ZipFile(io.BytesIO(zip_data))
        filenames = zf.namelist()
        if len(filenames) != 1:
            cnt = len(filenames)
            raise RuntimeError(
                f'Not exactly one file in attached zip file ({cnt} found)')
        xml = zf.read(filenames[0])
        xml_fd = io.StringIO(xml.decode())
    else:
        attachments = list(email_msg.iter_attachments())
        if not len(attachments) == 1:
            cnt = len(attachments)
            raise RuntimeError(
                f'Not exactly one attachment in mail ({cnt} found)')
        # Attachments are gzip-compressed XML reports.
        compressed_data = attachments[0].get_content()
        xml_fd = io.StringIO(gzip.decompress(compressed_data).decode())
    return xml_fd
def process(git_patch_file):
    """Convert a git format-patch file into an hg changeset patch string.

    Returns None (after a note on stderr) when the input does not look
    like a valid git patch.
    """
    message = email.parser.Parser().parse(git_patch_file)
    author = clean_header(message['From'])
    title = clean_header(message['subject'])
    if not len(title) or not len(author):
        sys.stderr.write("%s does not look like a valid git patch file, skipping\n" % git_patch_file.name)
        return
    author_pair = email.utils.parseaddr(author)
    # Drop a leading "[PATCH]" / "[PATCH n/m]" marker from the subject.
    prefix_match = re.match(r"\[PATCH( \d+/\d+)?\] ", title)
    if prefix_match:
        title = title[prefix_match.end():]
    body = message.get_payload()
    # git format-patch wraps the diff (including trailing whitespace):
    # ---
    # <diff>
    # --
    # 2.0.3
    # This doesn't hurt parsing the diff at all, but the version number is
    # nonsense once the git specific items have been stripped
    body = re.sub(r'--\s?\n[0-9\.]+\n$', '', body)
    hg_lines = [
        '# HG changeset patch',
        '# User %s <%s>' % author_pair,
        '',
        title,
        '',
        body,
    ]
    return '\n'.join(hg_lines)
def findUrl(path):
    """Extract the textual content of the mail file at *path*.

    Returns the extracted payload, or "" when the file is missing or no
    content could be recovered.
    """
    parser = email.parser.Parser()
    try:
        # errors="ignore": mail bodies are frequently mis-encoded.
        with open(path, "r", encoding="utf-8", errors="ignore") as file_pointer:
            EmailMessage = parser.parse(file_pointer, headersonly=False)
    except FileNotFoundError:
        print("No such file")
        # The original fell through and crashed on the unbound
        # file_pointer name here; return the empty default instead.
        return ""
    content = ""
    try:
        content = recursivePayloadSearch(EmailMessage)
        # content = removehtmltag(removecsstag(content))
    except Exception:
        print("Get Content Error: %s" % (path))
    try:
        # blockPrint()
        # Some messages are a single bare base64 blob: if most of the raw
        # payload decodes cleanly, prefer the decoded form.
        trypayload = EmailMessage._payload
        pure_b64 = pure_b64decode(trypayload)
        if len(pure_b64) > 0.6 * len(EmailMessage._payload):
            # print(path)
            content = pure_b64
    except Exception:
        pass
    return content
def process_new_email(path, threads_index):
    """Parse the mail at *path* and file it into threads_index.

    A message whose cleaned subject matches an existing thread joins it;
    otherwise a new thread is created.  The touched thread is moved to
    the front of threads_index.
    """
    with open(path, "r") as fd:
        parser = email.parser.HeaderParser()
        email_headers = parser.parse(fd)
    subject = email_headers["subject"]
    from_field = {}
    from_field["name"], from_field["address"] = email.utils.parseaddr(
        email_headers["From"])
    to_field = {}
    # Fix: getaddresses() expects a *list* of header values; passing the
    # bare string made it join the individual characters.  get_all also
    # tolerates a missing To: header.
    to_field["addresses"] = email.utils.getaddresses(
        email_headers.get_all("to", []))
    if subject is not None:
        subject = headers.cleanup_subject(subject)
    thread = None
    for index, thr in enumerate(threads_index):
        if thr["subject"] == subject:
            # Pull the thread out; it is re-inserted at the front below.
            thread = threads_index.pop(index)
            break
    if not thread:
        # create a new thread
        thread = threads.create_thread_structure()
        thread["subject"] = subject
        thread["creator"] = from_field
    msg_id = os.path.basename(path)
    thread["messages"].append(msg_id)
    thread["date"] = datetime.datetime.utcnow()
    thread["unread"] = True
    if from_field["address"] != thread["creator"]["address"]:
        thread["lastreplyfrom"] = from_field
    threads_index.insert(0, thread)
def egg_info(files: Dict[str, str]) -> Tuple[Message, Distribution]:
    """Run ``setup.py egg_info`` over an in-memory project tree.

    Args:
        files: mapping of relative file name to contents; written into a
            temporary directory that acts as the project root.

    Returns:
        The parsed PKG-INFO message and the SetuptoolsReader metadata.
    """
    # TODO consider
    # https://docs.python.org/3/distutils/apiref.html#distutils.core.run_setup
    # and whether that gives a Distribution that knows setuptools-only options
    with tempfile.TemporaryDirectory() as d:
        for relname, contents in files.items():
            Path(d, relname).parent.mkdir(exist_ok=True, parents=True)
            Path(d, relname).write_text(contents)
        try:
            # run_setup must execute inside the project dir and prints
            # noise; save and restore cwd/stdout around it.
            cwd = os.getcwd()
            stdout = sys.stdout
            os.chdir(d)
            sys.stdout = io.StringIO()
            dist = run_setup(f"setup.py", ["egg_info"])
        finally:
            os.chdir(cwd)
            sys.stdout = stdout
        sources = list(Path(d).rglob("PKG-INFO"))
        assert len(sources) == 1
        with open(sources[0]) as f:
            parser = email.parser.Parser()
            info = parser.parse(f)
        reader = SetuptoolsReader(Path(d))
        # NOTE(review): this overwrites the run_setup() result above; the
        # returned Distribution comes from SetuptoolsReader -- confirm the
        # first assignment is still needed.
        dist = reader.get_metadata()
    return info, dist
def entry_list(request, root_dir):
    """Render the most recent blog entry found under *root_dir*."""
    # Get list of entries
    # TODO. Split in to separate function.
    # TODO. Cache
    entries = []
    for dir_name, subdirs, files in os.walk(root_dir):
        for file_name in files:
            m = entry_file_name_re.match(file_name)
            if m:
                # (year, month, day, slug, suffix) groups plus location.
                entries.append(m.groups() + (dir_name, file_name))
    entries.sort()  # Most recent last.
    # NOTE(review): raises IndexError when no entry files exist -- confirm
    # an empty blog cannot reach this view.
    entry = entries[-1]
    y, m, d, slug, suffix, dir_name, file_name = entry
    with open(os.path.join(dir_name, file_name), 'rb') as input:
        # Entries are stored as RFC 822 style files: headers + body.
        msg = parser.parse(input)
    template_args = {
        'title': msg['title'],
        'body': msg.get_payload(),
        'published': datetime.datetime(int(y), int(m, 10), int(d, 10), 12, 0, 0),
    }
    # Let’s add some links from a resource library.
    lib = get_library_or_404(settings.SPREADLINKS_DIR, 'spreadsite')
    links = lib.all_links
    template_args['links'] = links
    return render_to_response('downblog/entry_list.html', template_args,
                              RequestContext(request))
def index(self, msgs):
    """Index messages in the address book.

    Args:
        msgs: iterable of message objects exposing get_filename().
    """
    tot_msg = 0
    tot_addr = 0
    try:
        parser = email.parser.Parser()
        for msg in msgs:
            fn = msg.get_filename()
            with open(fn, "r") as f:
                # True -> headersonly: only headers are needed.
                mail = parser.parse(f, True)
            addrs = []
            for hdr in ("from", "to", "cc", "bcc"):
                addrs += mail.get_all(hdr, [])
            addrs = email.utils.getaddresses(addrs)
            tot_addr += self._add(addrs)
            tot_msg += 1
            if (tot_msg % 20) == 0:
                # Lazy %-args: the message is only formatted when DEBUG
                # logging is actually enabled (the original formatted
                # eagerly with the % operator).
                logging.debug("Messages: %d; addresses: %d",
                              tot_msg, tot_addr)
    finally:
        # At the end, save the DB
        self._merge_db()
        with open(os.path.expanduser(_DBPATH), "wb") as f:
            pickle.dump(self._db, f, pickle.HIGHEST_PROTOCOL)
        logging.info(
            "Total: indexed %d messages and %d addresses. "
            "%d unique addresses in the address book.",
            tot_msg, tot_addr, len(self._db))
def process_new_email(path, threads_index):
    """Parse the mail at *path* and file it into threads_index.

    Messages sharing a cleaned subject join the same thread; the touched
    thread is moved to the front of threads_index.
    """
    with open(path, "r") as fd:
        parser = email.parser.HeaderParser()
        email_headers = parser.parse(fd)
    subject = email_headers["subject"]
    from_field = {}
    from_field["name"], from_field["address"] = email.utils.parseaddr(
        email_headers["From"])
    to_field = {}
    # Fix: getaddresses() expects a *list* of header values; the bare
    # string was iterated character by character.  get_all also handles
    # a missing To: header gracefully.
    to_field["addresses"] = email.utils.getaddresses(
        email_headers.get_all("to", []))
    if subject is not None:
        subject = headers.cleanup_subject(subject)
    thread = None
    for index, thr in enumerate(threads_index):
        if thr["subject"] == subject:
            # Detach it; re-inserted at the front below.
            thread = threads_index.pop(index)
            break
    if not thread:
        # create a new thread
        thread = threads.create_thread_structure()
        thread["subject"] = subject
        thread["creator"] = from_field
    msg_id = os.path.basename(path)
    thread["messages"].append(msg_id)
    thread["date"] = datetime.datetime.utcnow()
    thread["unread"] = True
    if from_field["address"] != thread["creator"]["address"]:
        thread["lastreplyfrom"] = from_field
    threads_index.insert(0, thread)
def convertDate(date):
    """Parse a free-form date string and return it as a time.struct_time.

    Args:
        date: date/time string understood by dateutil.parser.

    Returns:
        time.struct_time for the parsed moment.
    """
    # Function-local import keeps dateutil an optional dependency.
    # (The original also imported time.struct_time and datetime, both
    # unused -- removed.)
    from dateutil import parser
    parsed = parser.parse(date)
    return parsed.timetuple()
def handle_check(self):
    """Digest the message on self.rfile and report the pyzor check result
    as JSON."""
    # SMTP policy keeps CRLF line endings intact while parsing.
    parser = email.parser.BytesParser(policy=email.policy.SMTP)
    msg = parser.parse(self.rfile)
    # pyzor identifies messages by a digest of the normalised body.
    digest = pyzor.digest.DataDigester(msg).value
    check = pyzor.client.Client().check(digest)
    self.write_json({k: v for k, v in check.items()})
def iter_package_entries(self, name: str) -> Iterator[PackageEntry]:
    """Yield a PackageEntry for each available version of *name*.

    Metadata is read straight out of each remote wheel; HttpFile gives
    ZipFile a seekable file object over the URL, so only the needed
    archive members are fetched.
    """
    for version, url in self.collect_best_dist_urls(name).items():
        http_file = cast(IO[bytes], HttpFile(url, self.session))
        with zipfile.ZipFile(http_file) as zf:
            with _open_metadata(zf, name) as f:
                # headersonly: the METADATA body (long description) is
                # not needed for dependency extraction.
                parser = email.parser.BytesParser()
                data = parser.parse(cast(BinaryIO, f), headersonly=True)
                dependencies: List[str] = data.get_all("Requires-Dist", [])
                yield PackageEntry(version, dependencies)
def __init__(self, file):
    """Initialise the message from *file*, parsing headers only.

    Chooses a text parser on Python 2 and a bytes parser on Python 3,
    and dispatches to the base initialiser accordingly.
    """
    if sys.version_info[0] < 3:
        # Python 2: text-mode parser, explicit old-style base call.
        message = email.parser.Parser().parse(file, headersonly=True)
        return mailbox.MaildirMessage.__init__(self, message)
    # Python 3: mail files are bytes on disk.
    message = email.parser.BytesParser().parse(file, headersonly=True)
    return super().__init__(message)
def handle_check(self):
    """Check the message on self.rfile against pyzor and emit JSON,
    skipping the well-known empty-body digest."""
    parser = email.parser.BytesParser(policy=email.policy.SMTP)
    msg = parser.parse(self.rfile)
    digest = pyzor.digest.DataDigester(msg).value
    # whitelist 'default' digest (all messages with empty/short bodies)
    if digest != 'da39a3ee5e6b4b0d3255bfef95601890afd80709':
        check = pyzor.client.Client().check(digest)
        self.write_json({k: v for k, v in check.items()})
def test_customize_message_encoding(self):
    """A customized multipart/alternative mailing must keep both the
    text and html parts and their quoted-printable iso-8859-1 content."""
    mailing = factories.MailingFactory(
        header="""Content-Transfer-Encoding: 7bit
Content-Type: multipart/alternative; boundary="===============2840728917476054151=="
Subject: Great news!
From: Mailing Sender <*****@*****.**>
To: <*****@*****.**>
Date: Wed, 05 Jun 2013 06:05:56 -0000
""",
        body="""
This is a multi-part message in MIME format.
--===============2840728917476054151==
Content-Type: text/plain; charset="iso-8859-1"
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable

This is a very simple mailing. I'm happy.

--===============2840728917476054151==
Content-Type: text/html; charset="iso-8859-1"
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head>
<META http-equiv=3DContent-Type content=3D"text/html; charset=3Diso-8859-1">
</head>
<body>
This is <strong> a very simple</strong> <u>mailing</u>. =
I'm happy! Nothing else to say...
</body></html>

--===============2840728917476054151==--
"""
    )
    recipient = factories.RecipientFactory(mailing=mailing)
    customizer = MailCustomizer(recipient)
    fullpath = os.path.join(customizer.temp_path,
                            MailCustomizer.make_file_name(recipient.mailing.id, recipient.id))
    # Start from a clean slate so the existence assertions are meaningful.
    if os.path.exists(fullpath):
        os.remove(fullpath)
    self.assertFalse(os.path.exists(fullpath))
    customizer._run_customizer()
    self.assertTrue(os.path.exists(fullpath))
    parser = email.parser.Parser()
    message = parser.parse(file(fullpath, 'rt'), headersonly = False)
    assert(isinstance(message, email.message.Message))
    self.assertTrue(message.is_multipart())
    self.assertEquals("multipart/alternative", message.get_content_type())
    self.assertEquals("text/plain", message.get_payload(i=0).get_content_type())
    self.assertEquals("text/html", message.get_payload(i=1).get_content_type())
    # decode=True undoes the quoted-printable transfer encoding.
    self.assertEquals(message.get_payload(i=0).get_payload(decode=True),
                      "This is a very simple mailing. \nI'm happy.")
    self.assertIn("This is <strong> a very simple</strong> <u>mailing</u>. I'm happy! ",
                  message.get_payload(i=1).get_payload(decode=True))
def move_files(from_where):
    """Reclassify mis-filtered county mails from the bogofilter "spam" or
    "unsure" folder back into ham.

    Args:
      from_where: "spam" or "unsure" -- which output folder to scan.
    """
    valid_names=[]
    COUNTY_NAMES = ["wilkes","polk","vance","transylvania","person","nash","chatham","mcdowell","lincoln","lenoir","dare","camden","caldwell","chowan","jackson","alexander","poke","montogmery","hoke","duplin","columbus","randoplh"]
    print COUNTY_NAMES
    COUNTY_NAMES = COUNTY_NAMES+ valid_names
    print COUNTY_NAMES
    parser = email.parser.Parser()
    global COUNTY, OUTPUT_PATH,COUNTY_EXTENSION
    output_directory = OUTPUT_PATH
    if from_where == "spam":
        input_directory = output_directory + "/spam"
    else:
        input_directory = output_directory + "/unsure"
    # You know the input directory. from the input specified to this function.
    # Loop through all the files of the input directory..
    for path,subdirs,files in os.walk(input_directory):
        for filename in files:
            # First initialize valid to zero.
            f_name = os.path.join(path,filename)
            print f_name
            # Escape spaces for the shell commands assembled below.
            replaced_fname = f_name.replace(" ","\ ")
            valid =0
            emaildata= parser.parse(open(f_name,"r"))
            subject = obtainSubjectEmail(emaildata)
            sender = obtainFromEmail(emaildata)
            if subject == None or sender == None:
                continue
            subject = subject[0].lower()
            sender = sender[0]
            print subject,sender,COUNTY_EXTENSION
            if (COUNTY_EXTENSION in sender) or "re:" in subject or "MAILER-DAEMON" in sender or "fw" in subject or "county" in sender or "@" not in sender:
                print subject,sender,COUNTY_EXTENSION,valid
                valid =1
            # If sender is from nash county . Make the valid 1
            # If the mail was a reply or forward make the valid 1
            for name in COUNTY_NAMES:
                if name in sender:
                    valid=1
                    break
            print valid
            if valid:
                print "Found"
                # NOTE(review): commands are built by string concatenation
                # and run through os.system -- a crafted file name could
                # inject shell commands.
                if from_where == "spam":
                    command = "bogofilter -Sn -v < " + replaced_fname
                    #os.system(command)
                    print command
                elif from_where == "unsure":
                    command = "bogofilter -n -v < " + replaced_fname
                    #os.system(command)
                    print command
                command = "mv " + replaced_fname + " " + output_directory + "/ham/"
                print command
                os.system(command)
def all_messages(self):
    """Yield (path, parsed message) for every file in self._all_files.

    Files that fail UTF-8 decoding are retried as latin-1 when
    self._try_latin1 is set; directories and undecodable files are
    skipped (with a log entry for the latter).
    """
    parser = email.parser.Parser()
    for path in self._all_files:
        try:
            # with-block guarantees the handle is closed even when
            # parsing raises (the original leaked it in that case).
            with codecs.open(path, 'r') as f:
                msg = parser.parse(f)
            yield (path, msg)
        except IsADirectoryError:
            pass
        except UnicodeDecodeError as e:
            if not self._try_latin1:
                logging.error("Couldn't decode %s - error: %s" % (path, e))
            else:
                logging.info("HACK! failed with UTF-8, trying with latin-1...")
                try:
                    with codecs.open(path, 'r', encoding='latin-1') as f:
                        msg = parser.parse(f)
                    yield (path, msg)
                except UnicodeDecodeError as ee:
                    logging.error("Couldn't decode %s, even after trying with latin-1 Hack, Unicode: %s" % (path, ee))
def handle_check(self):
    """Check the message on self.rfile against the first configured pyzor
    server and emit the result as JSON."""
    parser = email.parser.BytesParser(policy=email.policy.SMTP)
    msg = parser.parse(self.rfile)
    # NOTE(review): hard-coded path -- assumes the daemon runs as root.
    servers = pyzor.config.load_servers("/root/.pyzor/servers")
    # log = "/tmp/pyzor.log"
    # logging.basicConfig(filename=log,level=logging.DEBUG,format='%(asctime)s %(message)s', datefmt='%d/%m/%Y %H:%M:%S')
    # logging.info(servers)
    digest = pyzor.digest.DataDigester(msg).value
    # Only the first server in the list is queried.
    check = pyzor.client.Client().check(digest, address=servers[0])
    self.write_json({k: v for k, v in check.items()})
def process_mails(rootdirs, fields = ['Date','From'], ratio=1.0):
    """Walk *rootdirs* and yield [field values..., body] per sampled mail.

    Args:
      rootdirs: directories to walk recursively.
      fields: header names extracted for each message.
      ratio: sampling probability in [0, 1]; 1.0 processes every file.
    """
    # NOTE(review): mutable default argument `fields` -- safe only as long
    # as callers never mutate the list.
    parser = email.parser.Parser()
    for rootdir in rootdirs:
        for path, dirs, files in os.walk(rootdir):
            print path
            for fname in files:
                # Random sampling keeps very large corpora manageable.
                if not random.random() < ratio:
                    continue
                with open(os.path.join(path, fname)) as f:
                    # True -> headersonly parse.
                    hdrs = parser.parse(f, True)
                body = hdrs.get_payload()
                # Truncate at the first '---' (signature / quoted reply).
                if '---' in body:
                    body = body[:body.index('---')]
                yield [hdrs[field] for field in fields] + [body]
def do(self):
    """Parse the headers of self.eml_file into self.eml_msg, then
    delegate to Job.do.

    Parsing is best-effort: on any error self.eml_msg is left unset and
    the job still runs.
    """
    try:
        # with-block closes the handle (the original leaked the open fp).
        with open(self.eml_file, 'r') as fp:
            parser = email.parser.HeaderParser()
            self.eml_msg = parser.parse(fp)
    except Exception:
        # Deliberate best-effort: a malformed or missing file must not
        # stop the job pipeline.
        pass
    return Job.do(self)
def metadata(self):
    """
    Return the contents of the :file:`METADATA` file inside the wheel.
    """
    # Parsed lazily and cached in self._metadata.
    if self._metadata is None:
        with zipfile.ZipFile(self.wheel_file.open('rb')) as wheel:
            # dist-info directory name is derived from the package tags.
            filename = ('{self.package_tag}-'
                        '{self.package_version_tag}.dist-info/'
                        'METADATA'.format(self=self))
            with wheel.open(filename) as metadata:
                # METADATA is an RFC 822 style message; parse as bytes.
                parser = email.parser.BytesParser()
                self._metadata = parser.parse(metadata)
    return self._metadata
def generator():
    """Yield summary records for spam quarantined in the last 24 hours."""
    time_thresh = (datetime.now() - timedelta(days=1)).timestamp()
    # Header-only parse: message bodies are never needed for the summary.
    parser = email.parser.BytesHeaderParser()
    for match in glob.iglob(spam_glob):
        timestamp = os.stat(match)[stat.ST_CTIME]
        if timestamp > time_thresh:
            # Quarantine files are gzip-compressed raw messages.
            with gzip.open(match) as fh:
                res = parser.parse(fh)
            yield ns_dict({
                'to': res['To'],
                'frm': res['From'],
                'subj': res['Subject'],
                'id': res['X-Quarantine-ID'],
                'score': res['X-Spam-Score'],
                'time': timestamp
            })
def process_mails(outfile, rootdir): count = 0 # for progress reporting... skipped_no_to = 0 skipped_x_to = 0 parser = email.parser.Parser() with open(outfile, "wb") as outf: csvwriter = csv.writer(outf) for path, dirs, files in os.walk(rootdir): for fname in files: with open(os.path.join(path, fname)) as f: hdrs = parser.parse(f, True) mailpath = os.path.join(path[len(rootdir):], fname) outelems = [] outelems.append(mailpath) outelems.append(hdrs['Date']) outelems.append(hdrs['Message-ID']) outelems.append(hdrs['From']) # for each of the "to" fields, add a separate row for each # recipient, and the "to-type" written = False for to in ['To', 'Cc', 'Bcc']: if hdrs[to]: for rcpt in hdrs[to].split(','): csvwriter.writerow(outelems + [rcpt.strip(), to]) written = True if not written: # is this a company-wide email? if hdrs['X-to'] and "All Enron Worldwide" in hdrs['X-To']: csvwriter.writerow(outelems + ['All Enron Worldwide', 'X-To']) else: if hdrs['X-To'] or hdrs['X-cc'] or hdrs['X-bcc']: xtos = hdrs['X-To'] + hdrs['X-cc'] + hdrs['X-bcc'] print "Skipping %s, has X-to fields: %s" % (mailpath, xtos) skipped_x_to += 1 else: skipped_no_to += 1 # progress reporting count += 1 if count % 100 == 0: print count, "(skipped: %d,%d)" % (skipped_x_to, skipped_no_to)
def main():
    """Read a mail from stdin, spool each printable attachment, and send
    zephyr notifications to the sender about the outcome."""
    username = None
    try:
        # EmailMessage doesn't exist before 3.4
        # parser = email.parser.Parser(_class=email.message.EmailMessage)
        parser = email.parser.Parser()
        msg = parser.parse(sys.stdin)
        username = get_username(msg)
        if not username:
            raise MailprintError(
                'could not identify sender: {} | {} | {}'.format(
                    msg.get_unixfrom(), msg.get('Sender'), msg.get('From')))
        print('[{}] incoming message from {}'.format(datetime.datetime.now(),
                                                     username),
              file=sys.stderr)
        subject = msg.get('Subject')
        if not subject:
            subject = ''
        spooled_file = False
        for part in msg.walk():
            name = part.get_filename()
            if not name:
                # No filename -> not an attachment; skip the part.
                continue
            mimetype = part.get_content_type()
            if (mimetype not in TYPE_WHITELIST and
                    part.get_content_maintype() != 'text'):
                send_zephyr(
                    [username], 'error',
                    'file ' + part.get_filename() + ' has illegal type ' +
                    mimetype + '\nplease send a text file, ' +
                    'or a file with type:\n' + '\n'.join(TYPE_WHITELIST))
                continue
            pdf = mimetype == 'application/pdf'
            # "color" anywhere in the subject selects colour printing.
            spool_file(name, part.get_payload(decode=True), username, pdf,
                       'color' in subject)
            send_zephyr([username], 'info', 'Spooled file: ' + name)
            spooled_file = True
        if not spooled_file:
            send_zephyr([username], 'error',
                        'Your print request with subject:\n' + subject +
                        '\nwas received, but had no printable attachments.')
    except MailprintError as e:
        e.send_zephyr()
        raise
    except Exception:
        zephyr_error()
        raise
def parse_email(filepath):
    """Parse the mail at *filepath* into a dict of utf-8 encoded fields.

    Header values are treated as cp1252 byte strings (Python 2) and
    transcoded to utf-8.
    """
    fileobj = open(filepath, "r")
    parser = email.parser.Parser()
    emailobj = parser.parse(fileobj)
    fileobj.close()
    revised_parsed_email = dict()
    # NOTE(review): a missing header yields None and .decode() would raise
    # -- assumes every one of these headers is always present.
    revised_parsed_email['message_id'] = emailobj['Message-ID'].decode('cp1252').encode('utf-8')
    revised_parsed_email['subject'] = emailobj['Subject'].decode('cp1252').encode('utf-8')
    revised_parsed_email['date'] = emailobj['Date'].decode('cp1252').encode('utf-8')
    revised_parsed_email['from'] = emailobj['From'].decode('cp1252').encode('utf-8')
    revised_parsed_email['to'] = parse_email_addresses(emailobj['To'])
    revised_parsed_email['cc'] = parse_email_addresses(emailobj['Cc'])
    revised_parsed_email['bcc'] = parse_email_addresses(emailobj['Bcc'])
    revised_parsed_email['body'] = emailobj.get_payload().decode('cp1252').encode('utf-8')
    return revised_parsed_email
def parseElement(filename,element):
    """Print either the innermost message body or a named header of the
    mail stored in *filename*."""
    parser = email.parser.Parser()
    email_val = parser.parse(open(filename,"r"))
    element_val=None
    if element.lower()=="message":
        # Descend through nested multiparts until get_payload(0) fails,
        # then take that innermost part's payload.
        while True:
            try:
                email_val = email_val.get_payload(0)
            except:
                break
        element_val = email_val.get_payload()
    else:
        # First occurrence of the requested header.
        element_val=email_val.get_all(element)[0]
    if element_val!=None:
        print element_val
    else:
        print ""
def get_digest(source_file_names: Sequence[Path], block_size: int = 8192) -> Optional[str]:
    """Return a SHA256 hash composed from the content of all source files.

    Args:
        source_file_names: A sequence of source file paths

    Returns:
        A SHA256 hash composed from the content of all source files."""
    # See the PEP-376 RECORD file specification:
    # <https://www.python.org/dev/peps/pep-0376/#record>
    record_re = re.compile(r'\.dist-info/RECORD$')
    pkg_info_re = re.compile(r'\.egg-info/PKG-INFO$')
    all_paths = set(source_file_names)
    if not all_paths:
        return None
    sha = hashlib.sha256()
    hashed_via_record = set()
    # Pass 1: fold in the per-file hashes listed in any RECORD files and
    # remember which files those entries cover.
    for record_path in sorted(all_paths):
        if not record_re.search(str(record_path)):
            continue
        base = record_path.parent.parent
        with open(record_path, 'r', buffering=block_size) as record:
            rows = csv.reader(record, delimiter=',', quotechar='"',
                              lineterminator=os.linesep)
            for row in rows:
                entry_name, entry_hash, _rest = row[:3]
                entry_path = base / entry_name
                if entry_hash and entry_path in all_paths:
                    sha.update((str(entry_name) + entry_hash).encode())
                    hashed_via_record.add(entry_path)
    # Pass 2: hash the remaining files directly.
    for leftover in sorted(all_paths - hashed_via_record):
        with open(leftover, 'rb', buffering=block_size) as source_file:
            if pkg_info_re.search(str(leftover)):
                # Ensure deterministic field order from PKG-INFO files
                # See: https://www.python.org/dev/peps/pep-0314/#including-metadata-in-packages
                parser = email.parser.BytesHeaderParser(
                    policy=email.policy.default)
                for header, value in sorted(parser.parse(source_file).items()):
                    sha.update(header.encode())
                    sha.update(value.encode())
            else:
                sha.update(source_file.read())
    return sha.hexdigest()
def __init__(self, config, sendmail, *a, **kw):
    """
    Args:
      config: ConfigParser object holding configuration for the Mailing
        Set SMTP server.
      sendmail: A function with the same signature as smtp.sendmail
        which will be called to send outgoing messages.  Test code uses
        this to check assertions on the outgoing messages.  Production
        code passes in smtp.sendmail itself.
    """
    smtp.SMTPFactory.__init__(self, *a, **kw)
    self.config = config
    self.sendmail = sendmail
    # Cache list definitions and use them to parse destination addresses
    resolver = MailingSetState(self.config)
    # Bind the resolver once; self.parse(address) then resolves set
    # addresses against this cached state.
    self.parse = lambda address: parser.parse(resolver, address)
def git_am_patch_split(f):
    """Parse a git-am-style patch and split it up into bits.

    :param f: File-like object to parse
    :return: Tuple with commit object, diff contents and git version
    """
    parser = email.parser.Parser()
    msg = parser.parse(f)
    c = Commit()
    c.author = msg["from"]
    c.committer = msg["from"]
    try:
        patch_tag_start = msg["subject"].index("[PATCH")
    except ValueError:
        subject = msg["subject"]
    else:
        # Strip the "[PATCH n/m] " prefix from the subject.
        close = msg["subject"].index("] ", patch_tag_start)
        subject = msg["subject"][close+2:]
    c.message = subject.replace("\n", "") + "\n"
    first = True
    # Iterate the payload as text lines.  The original wrapped the str
    # payload in BytesIO, so on Python 3 every comparison against the
    # str literals below could never match (and BytesIO(str) raises).
    body = iter(msg.get_payload().splitlines(True))
    for l in body:
        if l == "---\n":
            # Start of the diff proper.
            break
        if first:
            if l.startswith("From: "):
                c.author = l[len("From: "):].rstrip()
            else:
                c.message += "\n" + l
            first = False
        else:
            c.message += l
    diff = ""
    for l in body:
        if l == "-- \n":
            # Trailer separator emitted by git format-patch.
            break
        diff += l
    try:
        version = next(body).rstrip("\n")
    except StopIteration:
        version = None
    return c, diff, version
def generate_single_mbox(conn, listid, year, month, destination):
    """Write all visible messages of one list for one month into an mbox
    file at *destination*.

    Messages that cannot be serialised are skipped with a note.
    """
    curs = conn.cursor()
    curs.execute("SELECT id, rawtxt FROM messages m INNER JOIN list_threads t ON t.threadid=m.threadid WHERE hiddenstatus IS NULL AND listid=%(listid)s AND date>=%(startdate)s AND date <= %(enddate)s ORDER BY date", {
        'listid': listid,
        'startdate': date(year, month, 1),
        # monthrange()[1] is the last day of the month.
        'enddate': date(year, month, calendar.monthrange(year, month)[1]),
    })
    with open(destination, 'w', encoding='utf8') as f:
        for id, raw, in curs:
            s = BytesIO(raw)
            # compat32 policy tolerates arbitrarily broken legacy mail.
            parser = email.parser.BytesParser(policy=email.policy.compat32)
            msg = parser.parse(s)
            try:
                # unixfrom=True emits the mbox "From " separator line.
                x = msg.as_string(unixfrom=True)
                f.write(x)
            except UnicodeEncodeError as e:
                print("Not including {0}, unicode error".format(msg['message-id']))
            except Exception as e:
                print("Not including {0}, exception {1}".format(msg['message-id'], e))
def eatfiles(dirname, outfile):
    """Walk *dirname* harvesting (address, display-name) pairs from all
    mail headers into emailNameMap, then dump the map to *outfile* as
    "name , address" lines."""
    for dirpath, dirs, files in os.walk(dirname):
        for filename in files:
            #this is bs needs to be any file except a directory, I think??
            with open(os.path.join(dirpath, filename)) as infile:
                try:
                    parser = email.parser.Parser()
                    # True -> headersonly parse.
                    msg = parser.parse(infile, True)
                    # skip things that were prior to 2006
                    year = email.utils.parsedate(msg.get('date'))[0]
                    if year < 2006:
                        continue
                    # include all address fields...maybe too many? Depends
                    # on how there are used in real life
                    addrFields = []
                    tos = msg.get_all('to',[])
                    fos = msg.get_all('from',[])
                    ccs = msg.get_all('cc',[])
                    bccs = msg.get_all('bcc',[])
                    resenttos = msg.get_all('resent-to',[])
                    resentfos = msg.get_all('resent-from',[])
                    resentccs = msg.get_all('resent-cc',[])
                    resentbccs = msg.get_all('resent-bcc',[])
                    # generate tuples from header fields
                    newaddrs = email.utils.getaddresses(tos+fos+ccs+bccs+resenttos+resentfos+resentccs+resentbccs)
                    #switch tuple order
                    newaddrs = [(t[1], t[0]) for t in newaddrs]
                    # filter out addresses that are lists, doemail, or already know longer names
                    newaddrs = [addr for addr in newaddrs if not infolosing(addr)]
                    # merge
                    emailNameMap.update(newaddrs)
                except:
                    # if there was a non-email file sitting there, try to ekip it and continue
                    print 'Problem parsing', dirpath + filename + '; Skipping file...'
                    continue
    of = open(outfile, 'w')
    for key, value in emailNameMap.iteritems():
        of.write(value + ' , ' + key + '\n')
def parse_path(path, stop_words, filter_addr_fn = None):
    """Recursively parse mail files under *path* into processed email
    dicts.

    Files whose names do not match file_re are skipped (with a debug
    note); a leaf file yields at most one entry.
    """
    emails = []
    if os.path.isdir(path):
        # Recurse into subdirectories, accumulating results.
        for filename in os.listdir(path):
            emails += parse_path(os.path.join(path, filename), stop_words, filter_addr_fn = filter_addr_fn)
    else:
        filename = os.path.basename(path)
        if not file_re.match(filename):
            if __DEBUG__:
                print >> sys.stderr, "File " + filename + " is not in the expected \d+. filename format. Skipping..."
        else:
            e = parser.parse(open(path))
            e = process_email(e, stop_words, filter_addr_fn = filter_addr_fn)
            if e != None:
                # Remember the originating file for traceability.
                e['file'] = path
                emails = [e]
    return emails
def main():
    """Read a message from stdin and process it.

    With --force, processing errors are reported on stderr instead of
    raised; unknown message kinds are silently passed through.
    """
    global opts
    opts = parse_args()
    flags = set()
    if opts.only_html:
        flags.add('only-html')
    parser = email.parser.Parser()
    msg = parser.parse(sys.stdin)
    try:
        msg = process_message(msg, flags=flags)
    except InvalidInputMessage:
        # Not a message we handle; leave it unmodified.
        pass
    except Exception, detail:
        if not opts.force:
            raise
        else:
            print >>sys.stderr, 'ERROR: %s' % detail
def test_customize_message(self):
    """A customized simple mailing must produce a single-part message
    with a Date header and the expected body."""
    mailing = factories.MailingFactory()
    recipient = factories.RecipientFactory(mailing=mailing)
    customizer = MailCustomizer(recipient)
    fullpath = os.path.join(customizer.temp_path,
                            MailCustomizer.make_file_name(recipient.mailing.id, recipient.id))
    # Start from a clean slate so the existence assertions below are
    # meaningful.
    if os.path.exists(fullpath):
        os.remove(fullpath)
    self.assertFalse(os.path.exists(fullpath))
    customizer._run_customizer()
    self.assertTrue(os.path.exists(fullpath))
    # print file(fullpath, 'rt').read()
    parser = email.parser.Parser()
    message = parser.parse(file(fullpath, 'rt'), headersonly = False)
    assert(isinstance(message, email.message.Message))
    self.assertFalse(message.is_multipart())
    self.assertTrue('Date' in message)
    self.assertEquals('This is a very simple mailing.', message.get_payload())
def parse_email(filepath):
    """Parse the mail at *filepath* into a dict of utf-8 encoded fields.

    Header values are treated as cp1252 byte strings (Python 2) and
    transcoded to utf-8.
    """
    fileobj = open(filepath, "r")
    parser = email.parser.Parser()
    emailobj = parser.parse(fileobj)
    fileobj.close()
    revised_parsed_email = dict()
    # NOTE(review): a missing header yields None and .decode() would raise
    # -- assumes every one of these headers is always present.
    revised_parsed_email['message_id'] = emailobj['Message-ID'].decode(
        'cp1252').encode('utf-8')
    revised_parsed_email['subject'] = emailobj['Subject'].decode(
        'cp1252').encode('utf-8')
    revised_parsed_email['date'] = emailobj['Date'].decode('cp1252').encode(
        'utf-8')
    revised_parsed_email['from'] = emailobj['From'].decode('cp1252').encode(
        'utf-8')
    revised_parsed_email['to'] = parse_email_addresses(emailobj['To'])
    revised_parsed_email['cc'] = parse_email_addresses(emailobj['Cc'])
    revised_parsed_email['bcc'] = parse_email_addresses(emailobj['Bcc'])
    revised_parsed_email['body'] = emailobj.get_payload().decode(
        'cp1252').encode('utf-8')
    return revised_parsed_email
def fuzz(parser):
    """Exercise a broad slice of the message API on the file named by
    sys.argv[1], parsed with *parser*."""
    with open(sys.argv[1], "rb") as fp:
        message = parser.parse(fp)
    # Serialisation and structure probes.
    message.as_bytes(policy=email.policy.default)
    message.is_multipart()
    message.get_unixfrom()
    # Header access, key by key.
    for key in message.keys():
        message.get(key)
        message.get_all(key)
    message.values()
    # Content-type and metadata accessors.
    message.get_content_type()
    message.get_content_maintype()
    message.get_content_subtype()
    message.get_default_type()
    message.get_filename()
    message.get_boundary()
    message.get_content_charset()
    message.is_attachment()
    message.get_content_disposition()
    # Finally walk the whole MIME tree.
    for part in message.walk():
        pass
def __init__(self, fromlines=None, fromstring=None, fromfile=None):
    """Build a message from exactly one of three sources.

    fromlines  -- list of lines (POP3 path); may be corrupt/invalid
                  (spam, worms, etc).
    fromstring -- raw message text (IMAP path); same caveat.
    fromfile   -- open file object (output of filters, etc), which
                  should be saner.
    """
    self.recipient = None
    self.received_by = None
    self.received_from = None
    self.received_with = None
    self.__raw = None
    parser = email.parser.Parser()
    if fromlines:
        joined = os.linesep.join(fromlines)
        try:
            self.__msg = parser.parsestr(joined)
        except email.errors.MessageError as err:
            self.__msg = corrupt_message(err, fromlines=fromlines)
        self.__raw = joined
    elif fromstring:
        try:
            self.__msg = parser.parsestr(fromstring)
        except email.errors.MessageError as err:
            self.__msg = corrupt_message(err, fromstring=fromstring)
        self.__raw = fromstring
    elif fromfile:
        try:
            self.__msg = parser.parse(fromfile)
        except email.errors.MessageError as err:
            # Shouldn't happen for file input.
            self.__msg = corrupt_message(err, fromstring=fromfile.read())
        # fromfile is only used by getmail_maildir, getmail_mbox, and
        # from reading the output of a filter; __raw stays None here.
    else:
        # Can't happen?
        raise SystemExit('Message() called with wrong arguments')
    self.sender = address_no_brackets(self.__msg['return-path'] or 'unknown')
def __init__(self, path, dependencies=None):
    """Wrap a wheel file and eagerly read its METADATA.

    Args:
        path: pathlib.Path to the .whl file.
        dependencies: optional mapping of dependencies; defaults to an
            empty dict.

    Raises:
        BadWheel: if no METADATA file can be located inside the archive
            under either the raw or canonicalized package name.
    """
    self.wheel_file = path
    self._filesize = path.stat().st_size
    self._filehash = None
    if dependencies is None:
        dependencies = {}
    self._dependencies = dependencies
    # str.split already returns a list; the original wrapped it in a
    # redundant list() copy.
    self._parts = path.stem.split('-')
    # Fix up retired tags (noabi->none)
    if self._parts[-2] == 'noabi':
        self._parts[-2] = 'none'
    # We read metadata now rather than lazily evaluating it to ensure that
    # we can report corrupt (or invalid) wheels upon construction rather
    # than waiting to find out later when metadata is queried
    with zipfile.ZipFile(self.open()) as wheel:
        # Try both name spellings; package_tag/package_canon presumably
        # derive from self._parts -- defined elsewhere on this class.
        filenames = (
            '{self.package_tag}-{self.package_version_tag}.dist-info/'
            'METADATA'.format(self=self),
            '{self.package_canon}-{self.package_version_tag}.dist-info/'
            'METADATA'.format(self=self),
        )
        for filename in filenames:
            try:
                with wheel.open(filename) as metadata:
                    # METADATA is an RFC 822-style header block.
                    parser = email.parser.BytesParser()
                    self._metadata = parser.parse(metadata)
            except KeyError:
                # Candidate name not present in the archive; try the next.
                pass
            else:
                break
        else:
            raise BadWheel(
                'Unable to locate METADATA in %s; attempted: %r; '
                'possible files: %r' % (self.wheel_file, filenames, {
                    info.filename
                    for info in wheel.infolist()
                    if info.filename.endswith('METADATA')
                }))
def index(self, msgs):
    """Index messages into the address book.

    Walks every message, collects all addresses from the from/to/cc/bcc
    headers and feeds them to self._add().  The database is merged and
    persisted even if indexing aborts part-way (finally block).
    """
    tot_msg = 0
    tot_addr = 0
    try:
        parser = email.parser.Parser()
        for msg in msgs:
            fn = msg.get_filename()
            try:
                with open(fn, "r") as f:
                    mail = parser.parse(f, True)  # headersonly=True
            except UnicodeDecodeError:
                # Mail is not valid UTF-8; retry with a permissive
                # 8-bit codec so one bad file doesn't abort the run.
                with open(fn, "r", encoding="latin9") as f:
                    mail = parser.parse(f, True)
            addrs = []
            for hdr in ("from", "to", "cc", "bcc"):
                addrs += mail.get_all(hdr, [])
            addrs = email.utils.getaddresses(addrs)
            tot_addr += self._add(addrs)
            tot_msg += 1
            if (tot_msg % 20) == 0:
                # Lazy %-args: formatting only happens if DEBUG is enabled.
                logging.debug("Messages: %d; addresses: %d", tot_msg, tot_addr)
    finally:
        # At the end, save the DB
        self._merge_db()
        with open(os.path.expanduser(_DBPATH), "wb") as f:
            pickle.dump(self._db, f, pickle.HIGHEST_PROTOCOL)
        logging.info(
            "Total: indexed %d messages and %d addresses. "
            "%d unique addresses in the address book.",
            tot_msg, tot_addr, len(self._db),
        )
def generator():
    """Yield one summary record per quarantined spam file from the last day.

    Scans spam_glob (module-level glob pattern, defined elsewhere) and,
    for each gzipped file created within the past 24 hours, parses its
    mail headers and yields an ns_dict summary.

    NOTE(review): non-gzipped quarantine files are silently skipped --
    confirm that is intended.
    """
    # Only consider files whose ctime falls within the past 24 hours.
    time_thresh = (datetime.now() - timedelta(days=1)).timestamp()
    emlparser = email.parser.BytesHeaderParser()
    # loop through all objects in the quarantine
    for match in glob.iglob(spam_glob):
        # Tuple-indexing with stat.ST_CTIME yields an integer ctime.
        timestamp = os.stat(match)[stat.ST_CTIME]
        # compare the timestamp against the time threshold
        if timestamp > time_thresh:
            # check if file is gzipped
            if '.gz' in pathlib.Path(match).suffixes:
                # open gzip file handle
                with gzip.open(match, 'rb') as gh:
                    res = emlparser.parse(gh)
                    # 'parser' below is presumably dateutil.parser (module
                    # import), distinct from emlparser -- TODO confirm.
                    yield ns_dict({
                        'date': parser.parse(res['Date']),
                        'to': res['To'],
                        'frm': res['From'],
                        'subj': res['Subject'],
                        # Path component after "virusmails/" serves as the id.
                        'id': match.split("virusmails/")[1],
                        'score': res['X-Spam-Score'],
                        'xto': res['X-Envelope-To'],
                        'time': timestamp
                    })
def from_maildir(cls, path):
    """Load a message from a maildir file, deriving flags from its name.

    The maildir convention encodes status letters after ':2,' in the
    basename and marks unread mail by placing it in the 'new' subdir.
    """
    # Parse the message itself (mtime doubles as the message timestamp).
    parser = cls.msg_parser()
    with open(path, 'rb') as f:
        timestamp = os.fstat(f.fileno()).st_mtime
        msg = parser.parse(f)
    # Everything after the first ':' in the basename is the info field
    # ('' when there is no colon at all).
    parent, basename = os.path.split(path)
    info = basename.partition(':')[2]
    flags = set()
    if info.startswith('2,'):
        # Map each maildir status letter onto its flag constant.
        for letter, flag in (('P', cls.FLAG_FORWARDED),
                             ('R', cls.FLAG_REPLIED_TO),
                             ('S', cls.FLAG_SEEN),
                             ('T', cls.FLAG_DELETED),
                             ('D', cls.FLAG_DRAFT),
                             ('F', cls.FLAG_FLAGGED)):
            if letter in info:
                flags.add(flag)
    # Messages delivered to the 'new' subdirectory are unread.
    if os.path.basename(parent) == 'new':
        flags.add(cls.FLAG_NEW)
    return cls(msg, timestamp, flags, set())
def test_customize_simple_message_with_recipient_attachment(self):
    """A recipient-level attachment yields a two-part customized message.

    Part 0 must carry the mailing body; part 1 must carry the decoded
    CSV attachment supplied in the recipient's contact data.
    """
    recipient = factories.RecipientFactory(
        contact_data={
            'email': '*****@*****.**',
            'custom': 'very simple',
            'attachments': [
                {
                    'filename': "export.csv",
                    'data': base64.b64encode("col1;col2;col3\nval1;val2;val3\n"),
                    'content-type': 'text/plain',
                    'charset': 'us-ascii',
                },
            ]
        }
    )
    customizer = MailCustomizer(recipient)
    fullpath = os.path.join(customizer.temp_path,
                            MailCustomizer.make_file_name(recipient.mailing.id, recipient.id))
    # Remove any stale output so the assertions really test this run.
    if os.path.exists(fullpath):
        os.remove(fullpath)
    self.assertFalse(os.path.exists(fullpath))
    customizer._run_customizer()
    self.assertTrue(os.path.exists(fullpath))
    parser = email.parser.Parser()
    # Context manager instead of file(): the handle is closed deterministically
    # (the original leaked it).
    with open(fullpath, 'rt') as fp:
        message = parser.parse(fp, headersonly=False)
    assert isinstance(message, email.message.Message)
    self.assertTrue(message.is_multipart())
    self.assertEquals(message.get_payload(i=0).get_payload(),
                      'This is a very simple mailing.')
    self.assertEquals(message.get_payload(i=1).get_payload(),
                      'col1;col2;col3\nval1;val2;val3\n')
def generate_single_mbox(conn, listid, year, month, destination):
    """Write one month of a list's visible messages to *destination* as mbox.

    Messages that cannot be serialized (unicode errors or anything else)
    are skipped with a diagnostic on stdout rather than aborting the run.
    """
    month_start = date(year, month, 1)
    month_end = date(year, month, calendar.monthrange(year, month)[1])
    cursor = conn.cursor()
    cursor.execute(
        "SELECT id, rawtxt FROM messages m INNER JOIN list_threads t ON t.threadid=m.threadid WHERE hiddenstatus IS NULL AND listid=%(listid)s AND date>=%(startdate)s AND date <= %(enddate)s ORDER BY date",
        {
            'listid': listid,
            'startdate': month_start,
            'enddate': month_end,
        })
    # compat32 keeps the raw bytes round-trippable through as_string().
    msg_parser = email.parser.BytesParser(policy=email.policy.compat32)
    with open(destination, 'w', encoding='utf8') as outfile:
        for msgid, raw in cursor:
            msg = msg_parser.parse(BytesIO(raw))
            try:
                outfile.write(msg.as_string(unixfrom=True))
            except UnicodeEncodeError:
                print("Not including {0}, unicode error".format(
                    msg['message-id']))
            except Exception as e:
                print("Not including {0}, exception {1}".format(
                    msg['message-id'], e))
def run(self): while True: while len(events_queue) != 0: event = events_queue.pop(0) if event["type"] == "create": try: print event["path"] with open(event["path"], "r") as fd: parser = email.parser.HeaderParser() email_headers = parser.parse(fd) subject = email_headers.get("Subject") print "Subject: %s" % subject if subject != None: subject = headers.cleanup_subject(subject) if subject in threads_index.data: threads_index.data[subject].append(event["path"]) else: threads_index.data[subject] = [event["path"]] except IOError: # Postfix/Dovecot creates temporary files. Ignore them pass time.sleep(EVENTS_QUEUE_PROCESSING_DELAY)
def msg(self):
    """Return an RFC 2822 parsed message instance read from disk."""
    source_path = os.path.join(self.dir_name, self.file_name)
    with open(source_path, 'rb') as stream:
        return parser.parse(stream)
# Parses each file from the Enron email dataset and produces a tab separated
# From and To email address tuples. Multiple recipients in the To: header are
# written out as multiple lines of output.
import email.parser
import os
import re
import sys


def remove_special_chars(s):
    """Strip angle brackets, quotes and spaces from an address string."""
    return re.sub(r"[<>\"' ]", "", s)


def main():
    """Print one "from<TAB>to" line per recipient of the file in argv[1]."""
    fname = sys.argv[1]
    # Enron maildir message files are named "1.", "2.", ... -- hence the
    # trailing-dot check.
    if os.path.isfile(fname) and fname.endswith("."):
        # Open in text mode: email.parser.HeaderParser works on str, not
        # bytes (the original opened 'rb', which fails on Python 3).
        # errors='replace' tolerates the dataset's non-UTF-8 bytes.
        with open(fname, "r", errors="replace") as fin:
            parser = email.parser.HeaderParser()
            msg = parser.parse(fin, headersonly=True)
        try:
            from_value = msg["From"]
            # A missing To: header makes msg["To"] None, triggering the
            # AttributeError handler below.
            to_values = msg["To"].replace("\r\n", "").replace("\t", "").split(", ")
            if from_value is not None and to_values is not None:
                from_value = remove_special_chars(from_value)
                for to_value in to_values:
                    to_value = remove_special_chars(to_value)
                    print("%s\t%s" % (from_value, to_value))
        except AttributeError:
            pass


if __name__ == "__main__":
    main()
# Mail-to-TwitPic gateway: parses one mail message from stdin and forwards
# it over local SMTP.  Python 2 code (note `except Exception, err`).
logging.basicConfig(filename='/var/log/twittermailgate.log',level=logging.DEBUG)
# NOTE(review): this unauthenticated Api instance is immediately replaced
# by the one below.
api = twitter.Api()
# Credentials are blank here -- presumably filled in at deploy time; verify.
api = twitter.Api(consumer_key='', consumer_secret='', access_token_key='', access_token_secret='')
parser = email.parser.Parser()
# The script is wired as a mail pipe: the full message arrives on stdin.
e = parser.parse(sys.stdin)
# NOTE(review): `sender` is read but never used below.
sender = e['From']
myaddy = '*****@*****.**'
message = e.get_payload()
# A str payload means a single-part mail; a list payload means multipart,
# which is taken here to indicate an attached-media tweet.
if isinstance(message, str):
    logging.debug("Looks like a normal tweet")
else:
    logging.debug("I think this is a media tweet")
try:
    smtpObj = smtplib.SMTP('localhost')
    # Forward the original message unchanged to the TwitPic posting address.
    smtpObj.sendmail(myaddy, '@twitpic.com', e.as_string())
    logging.info("Sent email to TwitPic")
except Exception, err:
    logging.error("Error: unable to send email :" + str(err))
def sanitize_html(html):
    """Strip unsafe markup from *html* and return the sanitized XML string."""
    sanitizing_parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    tree = sanitizing_parser.parse(html)
    return tree.toxml()
# -*- coding: utf-8 -*-
from pprint import pprint

# Working with e-mail data (email).
# (Japanese-translated documentation is available.)
# Parsing mail messages (email.parser).
import email
import email.parser

parser = email.parser.Parser()

# Parse from an open file object via an explicit Parser.
with open('email.txt') as f:
    m = parser.parse(f)
pprint(type(m))
pprint(m.items())

# Same result via the message_from_string convenience helper.
with open('email.txt') as f:
    s = f.read()
m = email.message_from_string(s)
pprint(m.items())

# message_from_file parses straight from the file object; use a context
# manager so the handle is closed (the original never closed it).
with open('email.txt') as f:
    msg = email.message_from_file(f)
pprint(type(msg))
pprint(msg.is_multipart())
pprint(msg.get_payload())
pprint(msg.keys())
pprint(msg.get('From'))
pprint(msg.as_string())
def filter(Action, filelist, outputfile):
    """Score files in one of several ways depending on which Action is passed to it

    Python 2 code.  Three modes:
      MARK_SPAM / MARK_NOT_SPAM -- train the corresponding brain and save;
      FILTER_SPAM_LIST          -- print a classification per file plus totals;
      FILTER_SPAM_SAVE          -- tag each message with a status header and
                                   print it (optionally appended to outputfile).

    NOTE(review): `filter` and the local `type` shadow builtins; `outfile`
    in the SAVE branch is never closed.
    """
    parser = email.parser.Parser()
    SF = SpamFilter("spam.brain","ham.brain")
    index = 0
    if Action in (MARK_SPAM, MARK_NOT_SPAM):
        # Training mode: pick the brain to feed and flag it modified so
        # SF.save() persists it afterwards.
        if Action == MARK_SPAM:
            brain = SF.spambrain
            SF.spambrain_modified = True
        else:
            brain = SF.hambrain
            SF.hambrain_modified = True
        for i in OPENIter(filelist):
            messages = [start_message(parser.parse(i))]
            try:
                brain.add_sample(wordtokenizer(messages))
            except IOError:
                # 'x' marks a message that could not be read/added.
                print 'x',
            index = index + 1
            if (index % 100) == 0:
                # Progress dot every 100 messages.
                print '.',
                sys.stdout.flush()
            #print i.name
        SF.save()
    elif Action == FILTER_SPAM_LIST:
        # Listing mode: classify every file and tally the outcome counts.
        msgs, spams, hams, unknowns = 0,0,0,0
        for i in OPENIter(filelist):
            # Tokenize once, then score against both brains.  The token
            # list is materialized so it can be iterated twice.
            l = list(wordtokenizer([start_message(parser.parse(i))]))
            spamscore = SF.spambrain.get_filescore(iter(l))
            hamscore = SF.hambrain.get_filescore(iter(l))
            msgs += 1
            if spamscore == hamscore:
                type = ' unknown'
                unknowns += 1
            elif spamscore > hamscore:
                type = ' spam'
                spams += 1
            else:
                type = 'not spam'
                hams += 1
            print i.name, type, hamscore,spamscore
        if msgs:
            # Percentage breakdown over the whole run.
            print "Spam: ", spams*100.0/msgs, " Not spam: ",hams*100.0/msgs, " Unknown: ",unknowns*100.0/msgs
    elif Action == FILTER_SPAM_SAVE:
        # Tagging mode: add a MarkovBrainSpamStatus header and emit the mail.
        for i in OPENIter(filelist):
            msg = parser.parse(i)
            l = list(wordtokenizer([start_message(msg)]))
            spamscore = SF.spambrain.get_filescore(iter(l))
            hamscore = SF.hambrain.get_filescore(iter(l))
            if spamscore == hamscore:
                msg['MarkovBrainSpamStatus'] = "Unknown"
            elif spamscore > hamscore:
                msg['MarkovBrainSpamStatus'] = "Spam"
            else:
                msg['MarkovBrainSpamStatus'] = "Not Spam"
            #in normal use cases, this should get redirected to a file from stdout
            if outputfile:
                # Temporarily swap stdout so print appends to outputfile.
                saveout = sys.stdout
                outfile = open(outputfile,'a')
                sys.stdout = outfile
                print str(msg)
                sys.stdout = saveout
            else:
                print str(msg)
def parseMail(filename):
    """Parse a mail file into a dict of address, date, subject and body fields.

    Args:
        filename: path to the raw mail file.

    Returns:
        dict with keys "from", "to", "cc", "date" (UTC-normalized string
        or "" when absent/unparseable), and -- when present --
        "original_subject", "subject", "message", "original_text".
    """
    return_type = {}
    # Open the file for parsing.  A context manager closes the handle
    # (the original leaked it), and the parsed message is bound to
    # `mail` rather than shadowing the stdlib `email` module.
    with open(filename, "r") as fp:
        mail = parser.parse(fp)
    # Sender section as a string, then parsed into a list of addresses.
    from_email = obtainFromEmail(mail)
    return_type["from"] = text_parsing.parse_addresses(from_email)
    # Same for the "To" section.
    to_email = obtainToEmail(mail)
    return_type["to"] = text_parsing.parse_addresses(to_email)
    # And the "Cc" section.
    cc_email = obtainCcEmail(mail)
    return_type["cc"] = text_parsing.parse_addresses(cc_email)
    # Obtain the date the email was sent.
    date = obtainDateEmail(mail)
    if date is not None:
        # Headers look like "Mon, 5 Mar 2001 09:51:00 -0800": drop the
        # optional weekday before the comma.
        comma = date[0].split(",")
        if len(comma) == 2:
            comma = comma[1].strip()
        else:
            comma = comma[0].strip()
        provided_date = comma.split(" ")
        # Zero-pad single-digit day numbers so strptime's %d matches.
        if len(provided_date) > 1 and len(provided_date[1]) == 1:
            provided_date[1] = "0" + provided_date[1]
        # Timezone offset like "-0800": the sign decides add/subtract.
        add_subtract = 1
        if len(provided_date) > 4 and len(provided_date[4]) > 4 and provided_date[4][0] == "-":
            add_subtract = -1
        try:
            timezone_hours = int(provided_date[4][1:3])
            timezone_minutes = int(provided_date[4][3:5])
        except (IndexError, ValueError):
            # No/invalid offset field: treat as UTC.
            timezone_hours = 0
            timezone_minutes = 0
        provided_date = " ".join(provided_date[0:4])
        try:
            datetime_object = datetime.datetime.strptime(provided_date, "%d %b %Y %H:%M:%S")
            new_date_utc = datetime_object + add_subtract * datetime.timedelta(
                hours=timezone_hours, minutes=timezone_minutes)
            return_type["date"] = new_date_utc.strftime("%d %b %Y %H:%M:%S")
        except ValueError:
            logger.debug("Could not find date in the correct format. Skipping it " + date[0])
            return_type["date"] = ""
    else:
        # No valid date header.
        return_type["date"] = ""
    # Now obtain the subject section of the email.
    subject = obtainSubjectEmail(mail)
    if subject is not None:
        # Keep the raw subject for printing as raw text.
        return_type["original_subject"] = subject[0]
        # Tokenize and strip stopwords for the cleaned subject.
        subject, extra_info = text_parsing.parse_text(subject[0])
        return_type["subject"] = subject.replace("\"", "")
    # Finally obtain the text section of the email.
    message = obtainMessageEmail(mail)
    if message is None:
        return return_type
    # Strip headers/footers/stopwords; keep both cleaned and raw text.
    message, original_text = text_parsing.parse_message(message)
    return_type["message"] = message
    return_type["original_text"] = original_text
    return return_type