def _encode_parts(self, header_data, msg_data, encoder):
    """Re-encode every leaf MIME part whose payload is not pure ASCII.

    The header and body bytes are concatenated and parsed. Each
    non-multipart part whose payload cannot be encoded as ASCII loses its
    ``Content-Transfer-Encoding`` header and is handed to *encoder*. The
    parsed message is finally passed to ``self.parse_msg``.

    :type header_data: :py:obj:`bytes`
    :type msg_data: :py:obj:`bytes`
    :param encoder: Callable invoked with each part that needs encoding.

    """
    self.headers = None
    self.message = None
    if six.PY3:
        parsed = BytesParser().parsebytes(header_data + msg_data)
    else:
        parsed = Parser().parsestr(header_data + msg_data)
    for node in parsed.walk():
        if node.is_multipart():
            continue
        body = node.get_payload()
        try:
            body.encode('ascii')
        except UnicodeError:
            # Drop the stale header so the encoder can set a fresh one.
            del node['Content-Transfer-Encoding']
            encoder(node)
    self.parse_msg(parsed)
def _get_email_content(uid, data):
    """Collect the text, HTML and attachments of a raw email.

    Args:
        uid: Identifier forwarded to ``_read_attachment`` and ``_read_html``.
        data: Raw message bytes.

    Returns:
        A dict with the keys ``text``, ``html`` and ``attachments``. When
        several parts share a content type, the last one wins.
    """
    content = dict(text=None, html=None, attachments=[])
    email = BytesParser(policy=policy.default).parsebytes(data)
    for part in email.walk():
        if part.is_multipart():
            continue
        if part.is_attachment():
            content['attachments'].append(_read_attachment(part, uid))
            continue
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            content['text'] = _read_text(part)
        elif content_type == 'text/html':
            content['html'] = _read_html(part, uid)
    if content['html'] and not content['text']:
        # content['html'] is opened as a file path here; its contents stand
        # in for the missing plain-text body. The context manager closes the
        # handle even if reading fails.
        with open(content['html'], 'r') as html_file:
            content['text'] = html_file.read()
    return content
def get_email(num, conn):
    """Fetch message *num* over *conn* and summarise it as a dict.

    Args:
        num: Message identifier passed to ``conn.fetch``.
        conn: Object exposing ``fetch(num, '(RFC822)')``.

    Returns:
        A dict with ``From`` and ``Subject`` (both run through
        ``decode_str``), ``File`` (a list of ``UploadedFile`` objects, one
        per part that carries a filename) and, when a ``text/plain`` part
        exists, ``Body`` holding the text of the last such part.
    """
    result = {}
    _status, content = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(content[0][1])

    result["From"] = decode_str(msg.get("From"), "From")
    result["Subject"] = decode_str(msg.get('Subject'), "Subject")
    result["File"] = []

    for part in msg.walk():
        if part.get_content_type() == "text/plain":
            body = part.get_payload(decode=True)
            # A part without a declared charset reports None, which
            # bytes.decode() rejects with TypeError; fall back to UTF-8.
            charset = part.get_content_charset() or "utf-8"
            result["Body"] = body.decode(charset)

        file_name = part.get_filename()
        if file_name is not None:
            name = decode_str(file_name, "File")
            payload = part.get_payload(decode=True)
            content_type = part.get_content_type()
            new_file = ContentFile(payload)
            result["File"].append(
                UploadedFile(new_file, name, content_type, new_file.size,
                             None, None))
    return result
def get_content(num):
    """Fetch message *num* and dispatch each leaf part's text by email date.

    The message is fetched through the module-level ``raw_conn``. Every
    non-multipart part is decoded as GBK and handed to ``get_transfer_v1``
    or ``get_transfer_v2`` depending on whether ``time_formate`` returns
    ``'1'`` or ``'2'`` for the email date.

    A ``TypeError`` while parsing or walking prints ``empty-email``; a
    ``UnicodeDecodeError`` prints ``hahah``. Either one stops the walk.
    """
    print(num)
    # The status half of the fetch reply is unused; do not shadow ``type``.
    _status, data = raw_conn.fetch(num, '(RFC822)')
    email_date = get_date(email_list[int(count)])
    try:
        msg = BytesParser().parsebytes(data[0][1])
        for part in msg.walk():
            if part.is_multipart():
                continue
            content = part.get_payload(decode=True).decode('GBK')
            temp = time_formate(email_date)
            print(temp)
            if temp == '1':
                print(temp)
                get_transfer_v1(content)
            elif temp == '2':
                print(temp)
                get_transfer_v2(content)
    except TypeError:
        print('empty-email')
    except UnicodeDecodeError:
        print('hahah')
def fetch_and_parse(uids):
    '''Fetch each UID from the module-level ``imap_server`` and parse it.

    Returns a list with one dict per successfully fetched message. Each
    dict holds ``Date`` (a datetime parsed from the Date header), the raw
    ``From``/``To``/``Delivered-To``/``Message-ID``/``Subject`` header
    values, and ``plain``/``html`` (content of the last matching
    non-attachment text part, or None). Messages whose fetch reply is not
    ``'OK'`` are skipped.
    '''
    result = list()
    for uid in uids:
        reply, email_data = imap_server.uid('fetch', uid, '(RFC822)')
        if reply != 'OK':
            continue

        raw_email = email_data[0][1]
        email = BytesParser(policy=default).parsebytes(raw_email)

        email_dict = dict()
        email_dict['Date'] = datetime.strptime(
            email['Date'], '%a, %d %b %Y %H:%M:%S %z')
        for header in ['From', 'To', 'Delivered-To', 'Message-ID', 'Subject']:
            email_dict[header] = email[header]
        email_dict['plain'] = None
        email_dict['html'] = None

        for part in email.walk():
            # Text parts sent as attachments are not the message body, and
            # get_body() returns None for them, which used to crash here.
            if part.is_attachment():
                continue
            content_type = part.get_content_type()
            if content_type == 'text/html':
                email_dict['html'] = part.get_content()
            elif content_type == 'text/plain':
                email_dict['plain'] = part.get_content()
        result.append(email_dict)
    return result
def __init__(self, data, group=None):
    """
    Parse raw message bytes and keep the single attachment they carry.

    Raises InvalidMessageError when the message has no attachment or more
    than one.

    Cribbed heavily from
    https://www.ianlewis.org/en/parsing-email-attachments-python
    """
    Loggable.__init__(self, group=group)

    self.subject = None
    self.time = None
    self.attachment = None

    message = BytesParser(policy=policy.default).parsebytes(data)
    self.subject = str(message["Subject"]).replace("\r\n", "")
    self.body = str(message.get_body())

    self.check_subject()
    self.check_body()
    self._set_time(message)

    self.log("info", 'Importing email: "{}"'.format(self.subject))

    collected = []
    for part in message.walk():
        raw_disposition = part.get("Content-Disposition")
        if not raw_disposition:
            continue

        fields = raw_disposition.strip().split(";")
        if len(fields) < 2:
            continue

        # Accept the part when it is declared an attachment or when its
        # second disposition field mentions a filename.
        declared_attachment = fields[0].lower() == "attachment"
        if not (declared_attachment or "filename" in fields[1].lower()):
            continue

        encoded = part.get_payload()
        collected.append(
            Attachment(b64decode(encoded),
                       content_type=part.get_content_type()))

    if not collected:
        raise InvalidMessageError(
            "There don't appear to be any attachments to this message")

    if len(collected) > 1:
        raise InvalidMessageError(
            "There's more than one attachment to this message. It cannot "
            "be indexed automatically.")

    self.attachment = collected[0]
def get_content(num):
    """Fetch message *num* via ``raw_conn`` and print each leaf part as GBK text."""
    print(num)
    # The status half of the fetch reply is unused; do not shadow ``type``.
    _status, data = raw_conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(data[0][1])
    for part in msg.walk():
        if part.is_multipart():
            continue
        content = part.get_payload(decode=True).decode('GBK')
        print(content)
def email_parser(email_file):
    """Extract a body from the email stored at *email_file*.

    Non-multipart messages:
        * ``text/plain`` returns the result of ``get_body`` (a message
          object, not a string).
        * ``text/html`` returns ``html_parse`` applied to the serialised
          body with its first three lines dropped.
        * anything else returns None.

    Multipart messages are walked while diagnostics are printed. Each
    ``multipart/alternative`` or ``multipart/related`` container sets the
    body from its HTML alternative; the first ``text/plain`` part replaces
    it with that part's decoded payload bytes and stops the walk.

    Returns:
        The extracted body, or None when no branch produced one
        (previously this raised UnboundLocalError).
    """
    with open(email_file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    body = None

    if not msg.is_multipart():
        if msg.get_content_maintype() == "text":
            subtype = msg.get_content_subtype()
            if subtype == "plain":
                body = msg.get_body(preferencelist='text/plain')
            elif subtype == "html":
                body = msg.get_body(preferencelist='html')
                # Drop the first three lines of the serialised part before
                # handing the markup to html_parse.
                html_body = '\n'.join(str(body).split("\n")[3:])
                body = html_parse(html_body)
            else:
                print("Don't know if html or text {}".format(subtype))
        return body

    print("Email is multipart")
    for index, part in enumerate(msg.walk(), start=1):
        print("part " + str(index))
        print(str(part.get('Content-Disposition')))
        content_type = part.get_content_type()
        print(content_type)
        print(part.get_content_subtype())
        if content_type in ('multipart/alternative', 'multipart/related'):
            body = part.get_body(preferencelist='html')
            print("----Body from get_body()-------")
            print(body)
            html_body = '\n'.join(str(body).split("\n")[3:])
            print("----Parsed text through beautiful soup-------")
            body = html_parse(html_body)
            print(body)
        if content_type == 'text/plain':
            body = part.get_payload(decode=True)
            print(body)
            break
    return body
def _get_parts_regular(self, data):
    """Yield the header parts, then one UnpackResult per decodable message part.

    Parts whose decoded payload is None are skipped. A part without a
    filename is named ``BODY.<EXT>``, where the extension comes from
    ``file_extension`` applied to the content subtype (default ``TXT``).
    """
    message = BytesParser().parsebytes(data)
    yield from self._get_headparts(message.items())
    for part in message.walk():
        name = part.get_filename()
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        if name is None:
            suffix = file_extension(part.get_content_subtype(), "TXT").upper()
            name = F'BODY.{suffix}'
        yield UnpackResult(name, payload)
def get_content(num):
    """Fetch message *num* via ``conn`` and write each leaf part to ``messy.log``.

    Every non-multipart part is decoded as GBK and written to
    ``messy.log`` opened in ``"w"`` mode.
    """
    print(num)
    # The status half of the fetch reply is unused; do not shadow ``type``.
    _status, data = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(data[0][1])
    for part in msg.walk():
        if part.is_multipart():
            continue
        content = part.get_payload(decode=True).decode('GBK')
        # NOTE(review): the file is reopened in "w" mode for every part, so
        # only the last part survives -- confirm this is intended.
        with open("messy.log", "w") as f:
            f.write(content)
def __init__(self, data, verbosity=1):
    """
    Parse raw message bytes and keep the single attachment they carry.

    Raises InvalidMessageError when the message has no attachment or more
    than one.

    Cribbed heavily from
    https://www.ianlewis.org/en/parsing-email-attachments-python
    """
    self.verbosity = verbosity

    self.subject = None
    self.time = None
    self.attachment = None

    message = BytesParser(policy=policy.default).parsebytes(data)
    self.subject = str(message["Subject"]).replace("\r\n", "")
    self.body = str(message.get_body())

    self.check_subject()
    self.check_body()
    self._set_time(message)

    Log.info(
        'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)

    collected = []
    for part in message.walk():
        raw_disposition = part.get("Content-Disposition")
        if not raw_disposition:
            continue

        fields = raw_disposition.strip().split(";")
        if fields[0].lower() != "attachment":
            continue

        encoded = part.get_payload()
        collected.append(
            Attachment(b64decode(encoded),
                       content_type=part.get_content_type()))

    if not collected:
        raise InvalidMessageError(
            "There don't appear to be any attachments to this message")

    if len(collected) > 1:
        raise InvalidMessageError(
            "There's more than one attachment to this message. It cannot "
            "be indexed automatically."
        )

    self.attachment = collected[0]
def download_attachments(dir, uid, data):
    """Save every named attachment of a raw email into *dir*.

    Args:
        dir: Target directory (``~`` is expanded).
        uid: Unused by this function; kept for interface compatibility.
        data: Raw message bytes.

    Returns:
        The list of file names written, in message order. Names are
        reduced to their final path component, so a crafted filename such
        as ``../x`` or ``/etc/x`` cannot escape *dir*. Attachments without
        a usable name or without a decodable payload are skipped.
    """
    attachments = []
    email = BytesParser(policy=policy.default).parsebytes(data)
    for part in email.walk():
        if not part.is_attachment():
            continue
        declared_name = part.get_filename()
        payload = part.get_payload(decode=True)
        if declared_name is None or payload is None:
            continue
        # The filename comes from the (untrusted) message: keep only its
        # last component, treating backslashes as separators too.
        safe_name = os.path.basename(declared_name.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            continue
        target = os.path.expanduser(os.path.join(dir, safe_name))
        with open(target, "wb") as attachment:
            attachment.write(payload)
        attachments.append(safe_name)
    return attachments
def process_email(file_name):
    """Return the decoded text of the first HTML part of a stored MIME message.

    The message is read from *file_name*. Each walked part is reported
    through ``debug``. The first part whose content subtype is ``html`` is
    decoded with its declared charset (``utf-8`` when none is declared),
    replacing undecodable bytes. Returns None when no HTML part exists.
    """
    with open(file_name, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    for part in msg.walk():
        debug(f"{part=}")
        subtype = part.get_content_subtype()
        if subtype != "html":
            continue
        debug("part is HTML: %s" % subtype)
        encoding = part.get_content_charset(failobj="utf-8")
        raw_bytes = part.get_payload(decode=True)
        return raw_bytes.decode(encoding, "replace")
    return None
def get_content(num):
    """Fetch message *num* via ``raw_conn`` and pass each leaf part to ``get_transfer_v1``.

    Every non-multipart part is decoded as GBK first. A ``TypeError``
    while parsing or walking prints ``empty-email``; a
    ``UnicodeDecodeError`` prints ``hahah``. Either one stops the walk.
    """
    print(num)
    # The status half of the fetch reply is unused; do not shadow ``type``.
    _status, data = raw_conn.fetch(num, '(RFC822)')
    try:
        msg = BytesParser().parsebytes(data[0][1])
        for part in msg.walk():
            if part.is_multipart():
                continue
            content = part.get_payload(decode=True).decode('GBK')
            get_transfer_v1(content)
    except TypeError:
        print('empty-email')
    except UnicodeDecodeError:
        print('hahah')
def decode_eml(dir, filename):
    """Print the headers and text of an .eml file and save its attachments.

    Args:
        dir: Directory prefix; it is concatenated directly with *filename*
            and with attachment names, so it must end with a path
            separator (e.g. ``'./xxx/'``).
        filename: Name of the .eml file inside *dir*.

    Every leaf part whose Content-Type carries a ``filename`` parameter is
    written to ``dir + <name>``. If that write fails the data is stored in
    a file called ``tmp`` in the current directory. All other leaf parts
    are printed as UTF-8 text.
    """
    print(
        '-------------------------------------------------------------------')
    print('Decoding: ' + dir + filename + "\n")

    # First pass: binary parse with the modern policy, used for the headers.
    with open(dir + filename, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
    _from = msg.get('From')
    _to = msg.get('To')
    _subject = msg.get('Subject')
    print('From: ' + _from)
    print('To: ' + _to)
    print('Subject: ' + _subject + '\n')

    # Second pass: legacy text-mode parse, used to walk the MIME parts.
    with open(dir + filename, 'r') as fp:
        msg = email.message_from_file(fp)

    for par in msg.walk():  # for each MIME block
        if par.is_multipart():
            continue
        content_type = par.get('Content-Type')
        print('content_type: ' + content_type)
        name = par.get_param('filename')
        if name:
            h = Header(name)  # decode oddly encoded file names
            dh = decode_header(h)
            fname = dh[0][0]  # attachment name
            print('附件:', str(fname, encoding='utf-8') + '\n')
            data = par.get_payload(decode=True)
            try:
                # Attachments are usually binary, so always write in 'wb'.
                with open(dir + str(fname, encoding='utf-8'), 'wb') as f:
                    f.write(data)
            except (OSError, ValueError):
                # The printed message says: the attachment name contains
                # illegal characters, saved as tmp.
                print('error: 附件名含非法字符,存为tmp')
                with open('tmp', 'wb') as f:
                    f.write(data)
        else:
            print(
                '文本内容: ',
                str(par.get_payload(decode=True), encoding='utf-8') + '\n')

    print(
        '--------------------------------End--------------------------------\n'
    )
def extract_text(self, current_file) -> dict:
    """Extract the normalised text of the first HTML part of a multipart email.

    The text nodes of the HTML are joined, stripped of punctuation,
    lower-cased, run through the ``SPACES``/``NEWLINE``/``TABS``
    substitutions, and have non-ASCII characters replaced by spaces.

    Args:
        current_file: Path of the .eml file to read.

    Returns:
        ``{basename: text}`` for the first ``text/html`` part. None when
        the message is not multipart, has no HTML part, or could not be
        processed.

    Side effects:
        On success updates ``self.mapping_dict`` and increments
        ``self.file_counter``. On any failure records the file's basename
        in ``self.error_files`` (once per file) and increments
        ``self.error_file_counter``.
    """
    file_name = os.path.basename(current_file)
    try:
        with open(current_file, 'rb') as eml_f:
            msg = BytesParser(policy=policy.default).parse(eml_f)

        if not msg.is_multipart():
            return None

        for part in msg.walk():
            if part.get_content_type() != 'text/html':
                continue
            soup = BeautifulSoup(part.get_content(), 'html.parser')
            body = soup.findAll(text=True)  # extract the text nodes

            # process the text list into a formatted string
            body = ' '.join(body) \
                .translate(str.maketrans('', '', string.punctuation)) \
                .lower()
            body = SPACES.sub(" ", body)
            body = NEWLINE.sub("", body)
            body = TABS.sub(" ", body)
            body = ''.join([i if ord(i) < 128 else ' ' for i in body])

            self.mapping_dict.update({file_name: body})
            self.file_counter += 1
            return {file_name: body}
    except Exception:
        # error_files stores basenames, so the duplicate check must compare
        # the basename as well (the full path never matched).
        if file_name not in self.error_files:
            self.error_file_counter += 1
            self.error_files.append(file_name)
    return None
def get_email(num, conn):
    """Fetch message *num* over *conn* and return its first leaf part as UTF-8 text.

    Before returning, prints *num* together with the Subject, X-Sender and
    Date headers, each passed through ``decode_str``. Returns None when
    the message has no non-multipart part.
    """
    typ, content = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(content[0][1])

    sub = msg.get('Subject')
    sender = msg.get('X-Sender')
    date = msg.get('Date')

    first_leaf = next(
        (node for node in msg.walk() if not node.is_multipart()), None)
    if first_leaf is None:
        return None

    print(num, decode_str(sub), decode_str(sender), decode_str(date))
    raw_bytes = first_leaf.get_payload(decode=True)
    return raw_bytes.decode('utf-8')
def parse_email(raw_email_decoded):
    '''Parse an email given as text into a flat dict.

    Args:
        raw_email_decoded: The complete message as ``str``.

    Returns:
        A dict with ``Date`` (a datetime parsed from the Date header), the
        ``From``/``To``/``Delivered-To``/``Message-ID``/``Subject`` header
        values (None when absent), and ``plain``/``html`` holding the
        content of the last matching non-attachment text part (None when
        there is none).
    '''
    email_dict = dict()
    raw_email = raw_email_decoded.encode()
    email = BytesParser(policy=default).parsebytes(raw_email)

    email_dict['Date'] = datetime.strptime(email['Date'],
                                           '%a, %d %b %Y %H:%M:%S %z')
    for header in ['From', 'To', 'Delivered-To', 'Message-ID', 'Subject']:
        email_dict[header] = email[header]
    email_dict['plain'] = None
    email_dict['html'] = None

    for part in email.walk():
        # Text parts sent as attachments are not the message body, and
        # get_body() returns None for them, which used to crash here.
        if part.is_attachment():
            continue
        content_type = part.get_content_type()
        if content_type == 'text/html':
            email_dict['html'] = part.get_content()
        elif content_type == 'text/plain':
            email_dict['plain'] = part.get_content()
    return email_dict
def _get_parts_regular(self, data):
    """
    Yield the header parts of an email followed by one UnpackResult per
    message part that has non-empty content.

    Raises ValueError when `data` contains anything other than whitespace
    and printable ASCII characters. Parts without a filename are named
    ``body.<extension>``; named parts are placed under ``attachments/``.
    """
    # Only whitespace and the printable ASCII range '!'..'~' are accepted.
    if not re.match(BR'^[\s!-~]+$', data):
        raise ValueError('This is not a plaintext email message.')

    msg = BytesParser().parsebytes(data)

    yield from self._get_headparts(msg.items())

    for k, part in enumerate(msg.walk()):
        path = part.get_filename()
        # Holds the error text to report if no content can be obtained.
        elog = None
        if path is None:
            extension = file_extension(part.get_content_type(), 'txt')
            path = F'body.{extension}'
        else:
            path = F'attachments/{path}'
        try:
            data = part.get_payload(decode=True)
        except Exception as E:
            # Regular decoding failed; fall back to the undecoded payload.
            try:
                data = part.get_payload(decode=False)
            except Exception as E:
                elog = str(E)
                data = None
            else:
                # Imported here rather than at module level.
                from refinery import carve
                self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                if isinstance(data, str):
                    data = data.encode('latin1')
                if isbuffer(data):
                    # Pipe the raw payload through the 'b64' carver and take
                    # its first result.
                    data = next(data | carve('b64', stripspace=True, single=True, decode=True))
                else:
                    # E is still the outer decoding error here, because the
                    # inner except clause did not run.
                    elog = str(E)
                    data = None
        if not data:
            # Empty or missing content: warn only if an error was recorded.
            if elog is not None:
                self.log_warn(F'could not get content of message part {k}: {elog!s}')
            continue
        yield UnpackResult(path, data)
def get_email_content(uid, data):
    """Collect the plain-text and HTML bodies of a raw email.

    Args:
        uid: Identifier forwarded to ``read_html``.
        data: Raw message bytes.

    Returns:
        A dict with the keys ``text`` and ``html``. When several parts
        share a content type, the last one wins.
    """
    content = dict(text=None, html=None)
    email = BytesParser(policy=policy.default).parsebytes(data)
    for part in email.walk():
        if part.is_multipart():
            continue
        content_type = part.get_content_type()
        if content_type == "text/plain":
            content["text"] = read_text(part)
        elif content_type == "text/html":
            content["html"] = read_html(part, uid)
    if content["html"] and not content["text"]:
        # content["html"] is opened as a file path here; its contents stand
        # in for the missing plain-text body. The context manager closes the
        # handle even if reading fails.
        with open(content["html"], "r") as html_file:
            content["text"] = html_file.read()
    return content
def parse_message(db: SqliteStorage, crypto: ProviderNaCl, message: StoredMessage):
    """
    Decrypt a stored message and split it into readable text and protocol
    attachments.

    Returns a tuple ``(text, address_pad_request, address_pad)``. All three
    are None when no private key is available for the message's header
    address. Only one address pad and one request per message are kept;
    a later one replaces an earlier one.
    """
    key_id, private_key = db.get_own_address_nacl_key(message.header_address)
    if not private_key:
        # Without a key the message cannot be decrypted; exit early.
        return (None, None, None)

    # The cleartext is expected to be a MIME formatted message.
    cleartext = crypto.decrypt(message.contents, private_key)
    msg = BytesParser(policy=policy.default).parsebytes(cleartext)

    text_chunks = []
    address_pad = None
    address_pad_req = None

    for part in msg.walk():
        if part.get_content_type() == 'application/json':
            # Protocol control messages are identified by their description.
            description = part['Content-Description']
            if description == NodeIntercom.address_pad_request_description:
                address_pad_req = NodeIntercom.AddressPadRequest.deserialize(part.get_content())
            if description == NodeIntercom.address_pad_description:
                address_pad = NodeIntercom.AddressPad.deserialize(part.get_content())
            continue
        if part.get_content_maintype() in ('multipart', 'application'):
            # Wrappers and other application payloads carry no text.
            continue
        text_chunks.append(part.get_content())

    msg_string = "From: {0}\nTo: {1}\n\n{2}".format(
        msg['from'], msg['to'], "\n".join(text_chunks))
    return (msg_string, address_pad_req, address_pad)
def parse_email(raw_emails):
    '''Parse fetched (uid, length, raw_headers, raw_email) tuples.

    Each tuple becomes a dict with ``uid`` and ``length`` (decoded to
    str), ``Date`` (a datetime from the Date header), ``metadata`` (the
    selected header values), ``plain``/``html`` (content of the last
    matching non-attachment text part, or None) and ``attachments`` (a
    list of dicts with ``MIME``, ``filename`` and ``body``).
    '''
    date_format = '%a, %d %b %Y %H:%M:%S %z'
    wanted_headers = ('From', 'To', 'Delivered-To', 'Message-ID', 'Subject')

    emails = []
    for uid, length, raw_headers, raw_email in raw_emails:
        email = BytesParser(policy=default).parsebytes(raw_email)
        headers = BytesParser(policy=default).parsebytes(raw_headers)

        record = {
            'uid': uid.decode(),
            'length': length.decode(),
            'Date': datetime.strptime(headers['Date'], date_format),
            'metadata': {name: headers[name] for name in wanted_headers},
            'plain': None,
            'html': None,
            'attachments': [],
        }

        for part in email.walk():
            if part.is_attachment():
                record['attachments'].append({
                    'MIME': part.get_content_type(),
                    'filename': part.get_filename(),
                    'body': part.get_content(),
                })
                continue
            kind = part.get_content_type()
            if kind == 'text/html':
                record['html'] = part.get_body().get_content()
            elif kind == 'text/plain':
                record['plain'] = part.get_body().get_content()

        emails.append(record)
    return emails
def extract_text(self, current_file: str) -> dict:
    """Extract normalised text from the HTML content of an .eml file.

    For a multipart message the first ``text/html`` part is used; for a
    non-multipart message the message itself is used when it is
    ``text/html``.

    Returns:
        ``{basename: text}`` when text was extracted and the file was not
        yet in ``self.mapping_dict``; a status string when the HTML has no
        text nodes or the file was already read; None otherwise (no HTML
        content, or an error was recorded).

    NOTE(review): the annotation says ``dict`` but strings and None are
    also returned -- confirm what callers expect.
    """
    try:
        with open(current_file, 'rb') as eml_file:
            #logger.info(info=f'Eml file: {os.path.basename(current_file)}')
            msg = BytesParser(policy=policy.default).parse(eml_file)
            if msg.is_multipart():
                for part in msg.walk():
                    if part.get_content_type() == 'text/html':
                        soup = BeautifulSoup(part.get_content(), 'html.parser')
                        body = soup.findAll(text=True)  # extract the text
                        # check if the body of the eml file is None or 0
                        if not body:
                            self.error_file_counter += 1
                            self.error_files.append(
                                os.path.basename(current_file))
                            return f"No text body in email: {os.path.basename(current_file)}"
                        else:
                            # process the text list into a formatted string:
                            # join, strip punctuation, lower-case, apply the
                            # module-level regex substitutions, then replace
                            # non-ASCII characters with spaces
                            body = ' '.join(body) \
                                .translate(str.maketrans('', '', string.punctuation)) \
                                .lower()
                            body = SPACES.sub(" ", body)
                            body = NEWLINE.sub("", body)
                            body = TABS.sub(" ", body)
                            body = ''.join(
                                [i if ord(i) < 128 else ' ' for i in body])
                            #print(f"body := {body}")
                            # UPDATE: added 6/20/2019
                            if len(body) == 0:
                                # no text was extracted from this file; add to error files list
                                # NOTE(review): there is no return here, so an
                                # empty body still falls through to the mapping
                                # update below -- confirm this is intended.
                                self.error_file_counter += 1
                                self.error_files.append(
                                    os.path.basename(current_file))
                                logger.error(
                                    error=
                                    f"Eml file: {os.path.basename(current_file)} has no text body."
                                )
                            # update the mapping dict if the file is not currently in the mapping dictionary
                            if os.path.basename(
                                    current_file
                            ) not in self.mapping_dict.keys():
                                self.mapping_dict[os.path.basename(
                                    current_file)] = body
                                self.file_counter += 1
                                return {os.path.basename(current_file): body}
                            else:
                                return f"Eml File: {os.path.basename(current_file)} has already been read in."
            else:  # UPDATE: added 6/20/2019
                # if email is not multipart, we can extract the text directly
                try:
                    if msg.get_content_type() == 'text/html':
                        soup = BeautifulSoup(msg.get_content(), 'html.parser')
                        body = soup.findAll(text=True)  # extract the text
                        # process the text list into a formatted string
                        body = ' '.join(body) \
                            .translate(str.maketrans('', '', string.punctuation)) \
                            .lower()
                        body = SPACES.sub(" ", body)
                        body = NEWLINE.sub("", body)
                        body = TABS.sub(" ", body)
                        body = ''.join(
                            [i if ord(i) < 128 else ' ' for i in body])
                        #print(f"body := {body}")
                        # update the mapping dict if the file is not currently in the mapping dictionary
                        if os.path.basename(
                                current_file
                        ) not in self.mapping_dict.keys():
                            self.mapping_dict[os.path.basename(
                                current_file)] = body
                            self.file_counter += 1
                            return {os.path.basename(current_file): body}
                        else:
                            return f"Eml File: {os.path.basename(current_file)} has already been read in."
                except Exception as e:  # NOTE: *added 06/28/2019*
                    # record the failure and log it; the function then
                    # returns None implicitly
                    self.error_file_counter += 1
                    self.error_files.append(os.path.basename(current_file))
                    logger.error(
                        error=
                        f'Eml file: {os.path.basename(current_file)} could not be text mined.'
                    )
                    logger.error(error=e)
    except (OSError, Exception) as e:
        # update the error file information
        # (OSError is a subclass of Exception, so this catches any Exception)
        self.error_file_counter += 1
        self.error_files.append(os.path.basename(current_file))
        logger.error(
            error=
            f'Eml file: {os.path.basename(current_file)} could not be text mined.'
        )
        logger.error(error=e)
class EmlParser():
    """Convenience accessors over a parsed .eml file."""

    def __init__(self, fileName):
        """Parse the bytes returned by ``readFile(fileName)``."""
        self.message = BytesParser(policy=policy.default).parsebytes(
            readFile(fileName))

    def getId(self):
        """Return ``getHashOfItem`` applied to the parsed message."""
        return getHashOfItem(self.message)

    @staticmethod
    def _attachment_filenames(part):
        """Yield every ``filename=`` value of an attachment part's Content-Disposition.

        Yields nothing when the part has no Content-Disposition header or
        its disposition is not ``attachment``. Surrounding quotes are
        removed from each value.
        """
        if 'content-disposition' not in part:
            return
        fields = [x.strip() for x in part['content-disposition'].split(';')]
        if fields[0].lower() != 'attachment':
            return
        for field in fields[1:]:
            if not field.startswith('filename='):
                continue
            _, _, val = field.partition('=')
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            yield val

    def getAttachmentData(self, name):
        """Return the decoded payload of the attachment called *name*, or None."""
        for part in self.message.walk():
            for val in self._attachment_filenames(part):
                if name == val:
                    return part.get_payload(decode=True)
        return None

    def getAttachmentNames(self):
        """Return the file names of all attachments, in message order."""
        found = []
        for part in self.message.walk():
            found.extend(self._attachment_filenames(part))
        return found

    def getPayloadHtml(self):
        """Return the HTML body as text, or '' when there is none."""
        body = self.message.get_body(('html',))
        if body:
            return self._decode_body(body.get_payload(decode=True))
        return ''

    def getPayloadPlain(self):
        """Return the plain-text body as text, or '' when there is none."""
        body = self.message.get_body(('plain',))
        if body:
            return self._decode_body(body.get_payload(decode=True))
        return ''

    def getSender(self):
        """Return ``extractEmails`` applied to the From header."""
        return extractEmails(str(self.message['from']))

    def getReceivers(self):
        """Return ``extractEmails`` applied to the To header."""
        return extractEmails(str(self.message['to']))

    def getSubject(self):
        """Return the decoded Subject header ('' when absent)."""
        return self._decode_entry(self.message['Subject'])

    def getDate(self):
        """Return the Date header formatted as '<date> <time>'."""
        dt = parse(self.message['Date'])
        return str(dt.date()) + " " + str(dt.time())

    def _decode_entry(self, entry):
        """Decode an RFC 2047 header value to text; None becomes ''."""
        if entry is None:
            return ''
        result = ''
        for chunk, encoding in decode_header(entry):
            if isinstance(chunk, str):
                result += chunk
            elif encoding is None:
                # decode_header reports unencoded runs that sit next to
                # encoded words as bytes with charset None, and
                # bytes.decode(None) raises TypeError.
                result += chunk.decode('ascii', 'replace')
            else:
                result += chunk.decode(encoding)
        return result

    def _decode_body(self, entry):
        """Decode body bytes as UTF-8, falling back to Latin-1."""
        try:
            entry = entry.decode('utf-8')
        except UnicodeDecodeError:
            entry = entry.decode('latin-1')
        return entry
def __call__(self, content):
    '''Parse an email message in "content".

    /content/ Standard encoded email message content, as bytes or str.

    Returns the parsed message as a dict with the keys subject, date,
    received, body, html, from, to, cc, bcc and attachments. ``body`` and
    ``html`` are the concatenated text of all text/plain and text/html
    parts respectively, or None when there is no such part.
    '''
    if isinstance(content, bytes):
        msgobj = BytesParser().parsebytes(content)
    else:
        msgobj = StrParser().parse(StringIO(content))

    subject = parse_header('Subject', msgobj)
    date = parse_header('Date', msgobj)

    received = []
    for part in (msgobj.get_all('Received') or []):
        # Split the header on self.re_received; the odd-indexed items become
        # keys and the even-indexed items that follow become their values.
        lx = self.re_received.split(part)
        tmp = dict(zip(lx[1::2], [x.strip() for x in lx[2::2]]))
        tx = tmp.get(';')
        if tx:
            tmp['time'] = parse_time(tx)
        received.append(tmp)

    fromaddr = parse_addr(msgobj, 'From')
    if date:
        date = date.replace(',', '')
    logger.debug('Parsing message: Date={0}, Subject={1}'.format(date, subject))

    #-------- Parsing attachments:
    attachments = []
    plain_chunks = []
    html_chunks = []
    for part in msgobj.walk():
        attachment = parse_attachment(part)
        if attachment:
            attachments.append(attachment)
            continue
        # parse text content
        content_type = part.get_content_type()
        if content_type[0:5] != 'text/':
            logger.debug('Ignored: Content_type "{0}" in message "{1}" from {2}, Date={3}'.format(content_type, subject, fromaddr, date))
            continue
        # Keep the decoded text as str. Re-encoding it to bytes and calling
        # str() on the result produced the "b'...'" repr in the output.
        text = str(part.get_payload(decode=True),
                   part.get_content_charset() or 'ascii',
                   'replace')
        if content_type == "text/plain":
            plain_chunks.append(text)
        elif content_type == "text/html":
            html_chunks.append(text)

    body = ''.join(plain_chunks) if plain_chunks else None
    html = ''.join(html_chunks) if html_chunks else None

    return {
        'subject': subject,
        'date': date,
        'received': received,
        'body': body,
        'html': html,
        'from': fromaddr,
        'to': parse_addr(msgobj, 'To'),
        'cc': parse_addr(msgobj, 'CC'),
        'bcc': parse_addr(msgobj, 'BCC'),
        'attachments': attachments
    }
def display_eml(eml_filepath):  # NOTE: marked by the author as needing tuning
    """Summarise an .eml file as a dict for display.

    Args:
        eml_filepath: Path of the .eml file.

    Returns:
        A dict with the sender, recipient, date, subject, plain-text body
        (None when the message has no plain body) and a list of
        ``(parameters, part)`` tuples, one per attachment, where
        ``parameters`` maps each Content-Disposition parameter name to its
        unquoted value.
    """
    with open(eml_filepath, 'rb') as eml_file:
        msg = BytesParser(policy=policy.default).parse(eml_file)

    # get_body() returns None when there is no plain-text body; guard it
    # instead of crashing on .get_content().
    plain_part = msg.get_body(preferencelist=('plain',))
    text = plain_part.get_content() if plain_part is not None else None

    found = []
    for part in msg.walk():
        if 'content-disposition' not in part:
            continue
        cdisp = [x.strip() for x in part['content-disposition'].split(';')]
        if cdisp[0].lower() != 'attachment':
            continue
        parsed = {}
        for kv in cdisp[1:]:
            # Split on the first '=' only: values such as file names may
            # contain '=' themselves, and empty fields have none at all.
            key, sep, val = kv.partition('=')
            if not sep:
                continue
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            parsed[key] = val
        found.append((parsed, part))

    # A second walk that built a never-used per-attachment dict was removed.
    # It had no effect on the result and could raise on an unbound local.
    return {
        "Odesílatel": msg.get('From'),
        "Příjemce": msg.get('To'),
        "Datum": msg.get('Date'),
        "Předmět": msg.get('Subject'),
        "Text zprávy": text,
        "Přílohy": found
    }
def email_analysis(filename, exclude_private_ip):
    """Extract indicators from an email file and print them as JSON.

    Args:
        filename: Path of the email file.
        exclude_private_ip: When true, private relay addresses are left
            out of ``relay_ip``.

    The printed document has a single ``data`` list. For a message with at
    least one header it contains one entry with the file name, the
    From/Sender/Subject/X-Originating-IP values and the numbered
    attachments, relay hops, relay IPs, URLs and domains that were found.
    Nothing is returned.
    """
    email_regex = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}"
    hop_regex = r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)"
    ipv4_regex = r"[0-9]+(?:\.[0-9]+){3}"
    # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
    ipv6_regex = r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))"

    def _reportable(candidate):
        """Tell whether *candidate* is a valid address that should be listed."""
        try:
            address = ipaddress.ip_address(candidate)
        except ValueError:
            # The regexes also match strings that are not real addresses.
            return False
        return not (address.is_private and exclude_private_ip)

    urlList = []
    domainList = []
    hopList = []
    hopListIP = []
    attachList = []
    data = {}
    data["data"] = []

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if msg:
        # Identify each url or attachment reported in the eMail body
        extractor = None
        for part in msg.walk():
            if part.get_content_type() in ("text/plain", "text/html"):
                if extractor is None:
                    extractor = URLExtract()
                urlList.extend(extractor.find_urls(part.get_content()))
            elif part.get_filename():
                attachList.append(part.get_filename())

        # Identify each domain reported in the eMail body
        for url in urlList:
            analyzeddomain = tldcache(url).registered_domain
            if analyzeddomain:
                domainList.append(analyzeddomain)

        # Remove Duplicate
        urlList = list(set(urlList))
        domainList = list(set(domainList))

        # A sender obfuscation technique involves entering two e-mails.
        # Only the last one is the real one, so the last match is kept.
        mail_from = ""
        if msg["From"]:
            matches = re.findall(email_regex, msg["From"], re.IGNORECASE)
            # A From header without any address used to raise IndexError.
            if matches:
                mail_from = matches[-1]

        mail_sender = msg["Sender"] or ""
        mail_subject = msg["Subject"] or ""
        mail_xorigip = msg["X-Originating-IP"] or ""

        data["data"].append({
            "Filename": os.path.basename(filename),
            "From": mail_from,
            "Sender": mail_sender,
            "Subject": mail_subject,
            "X-Originating-IP": mail_xorigip,
            "attachments": [],
            "relay_full": [],
            "relay_ip": [],
            "urls": [],
            "domains": []
        })

        # Identify each relay, walking the Received headers in reverse
        # order of appearance.
        received = msg.get_all("Received")
        if received:
            received.reverse()
            for line in received:
                hops = re.findall(hop_regex, line, re.DOTALL | re.X)
                for hop in hops:
                    for pattern in (ipv4_regex, ipv6_regex):
                        addresses = re.findall(pattern, hop[0],
                                               re.DOTALL | re.X)
                        if addresses and _reportable(addresses[0]):
                            hopListIP.append(addresses[0])
                    if hop[0]:
                        hopList.append(hop[0])

        entry = data["data"][0]
        if attachList:
            entry["attachments"].append(dict(enumerate(attachList)))
        if hopList:
            entry["relay_full"].append(dict(enumerate(hopList)))
        if hopListIP:
            entry["relay_ip"].append(dict(enumerate(hopListIP)))
        if urlList:
            entry["urls"].append(dict(enumerate(urlList)))
            entry["domains"].append(dict(enumerate(domainList)))

    print(json.dumps(data, indent=4))
def email_analysis(filename, exclude_private_ip, check_spf):
    """Parse an e-mail file and print its headers and indicators as JSON.

    The printed object contains the file name, the addresses extracted
    from the sender/recipient headers, the relay chain taken from the
    ``Received`` headers, the URLs and registered domains found in the
    text parts, the attachment hashes and, optionally, an SPF verdict.
    Fields for which nothing was found stay ``null``.

    :param filename: path of the raw e-mail file to analyse.
    :param exclude_private_ip: when true, ``relay_ip`` lists only the relay
        addresses that ``ipaddress`` does not report as private.
    :param check_spf: when true, ``spf.check2`` is run against the
        non-private relay addresses and ``spf`` is set to ``True`` if any
        of them yields ``"pass"``, otherwise ``False``.
    :returns: ``None``; the result is written to standard output.  Nothing
        is printed when the parsed message is falsy (it has no headers).
    """
    urlList = []
    hopList = []
    hopListIP = []
    domainList = []
    attachmentsList = []
    hopListIPnoPrivate = []

    resultmeioc = {
        "filename": os.path.basename(filename),
        "from": None,
        "sender": None,
        "x-sender": None,
        "to": None,
        "cc": None,
        "bcc": None,
        "envelope-to": None,
        "delivered-to": None,
        "subject": None,
        "x-originating-ip": None,
        "relay_full": None,
        "relay_ip": None,
        "spf": None,
        "urls": None,
        "domains": None,
        "attachments": None
    }

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if not msg:
        return

    address_re = re.compile(
        r"[A-Za-z0-9.!#$%&'*+\/=?^_`{|}~\-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}",
        re.IGNORECASE)

    def find_addresses(header_name):
        """Return every address found in the named header (possibly [])."""
        value = msg[header_name]
        return address_re.findall(value) if value else []

    #
    # Header analysis
    #

    # A sender obfuscation technique involves entering two e-mails.
    # Only the last one is the real one. Example:
    #
    # Sender Name: Mario Rossi <*****@*****.**>
    # Sender Mail: [email protected]
    for key, header_name in (("from", "From"),
                             ("sender", "Sender"),
                             ("x-sender", "X-Sender")):
        found_addresses = find_addresses(header_name)
        if found_addresses:
            resultmeioc[key] = found_addresses[-1]

    # Recipient headers: remove possible duplicates (keeping first-seen
    # order) and create a numbered dictionary.
    for key, header_name in (("to", "To"),
                             ("cc", "Cc"),
                             ("envelope-to", "Envelope-to")):
        found_addresses = find_addresses(header_name)
        if found_addresses:
            resultmeioc[key] = dict(enumerate(dict.fromkeys(found_addresses)))

    if msg["Bcc"]:
        resultmeioc["bcc"] = msg["Bcc"]

    if msg["Delivered-To"]:
        resultmeioc["delivered-to"] = msg["Delivered-To"]

    if msg["X-Originating-IP"]:
        # Usually the IP is in square brackets, I remove them if present.
        resultmeioc["x-originating-ip"] = msg["X-Originating-IP"].replace(
            "[", "").replace("]", "")

    if msg["Subject"]:
        resultmeioc["subject"] = msg["Subject"]

    # Patterns are compiled once, outside the per-header loop.
    hop_re = re.compile(
        r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
        re.DOTALL | re.X)
    ipv4_re = re.compile(r"[0-9]+(?:\.[0-9]+){3}", re.DOTALL | re.X)
    # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
    ipv6_re = re.compile(
        r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
        re.DOTALL | re.X)

    # Identify each relay. Received headers are walked in reverse order of
    # appearance in the file.
    for line in reversed(msg.get_all("Received") or []):
        for hop in hop_re.findall(line):
            # IPv4 matches first, then IPv6 matches, as separate passes.
            for candidate in ipv4_re.findall(hop[0]) + ipv6_re.findall(hop[0]):
                # The IPv6 pattern can yield the fragment "6::" (for
                # instance out of "IPv6:::1"); it is not a relay address.
                if candidate == "6::":
                    continue
                try:
                    parsed_ip = ipaddress.ip_address(candidate)
                except ValueError:
                    # The loose patterns also match strings that are not
                    # valid addresses (e.g. "999.1.1.1"); skip those.
                    continue
                hopListIP.append(candidate)
                if not parsed_ip.is_private:
                    hopListIPnoPrivate.append(candidate)

            if hop[0]:
                hopList.append(hop[0])

    if hopList:
        resultmeioc["relay_full"] = dict(enumerate(hopList))

    if hopListIP:
        selected_ips = hopListIPnoPrivate if exclude_private_ip else hopListIP
        resultmeioc["relay_ip"] = dict(enumerate(selected_ips))

    #
    # Body analysis
    #
    # https://gist.github.com/dperini/729294
    url_re = re.compile(
        r"(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?",
        re.UNICODE | re.IGNORECASE | re.MULTILINE)

    for part in msg.walk():
        content_type = part.get_content_type()

        if content_type == "text/plain":
            urlList.extend(url_re.findall(part.get_content()))

        if content_type == "text/html":
            # Best effort: a body with incorrect or unencoded HTML must not
            # abort the whole analysis, so parsing errors are ignored.
            try:
                soup = BeautifulSoup(part.get_content(), "html.parser")
                for tag in soup.find_all("a", href=True):
                    urlList.append(tag.get("href"))
            except Exception:
                pass

        attachment_name = part.get_filename()
        if attachment_name:
            # Decode the payload once and hash the same bytes three times.
            payload = part.get_payload(decode=True)
            if payload:
                attachmentsList.append({
                    "filename": attachment_name,
                    "MD5": hashlib.md5(payload).hexdigest(),
                    "SHA1": hashlib.sha1(payload).hexdigest(),
                    "SHA256": hashlib.sha256(payload).hexdigest()
                })

    # Remove duplicates while keeping first-seen order so that the output
    # is stable between runs.
    urlList = list(dict.fromkeys(urlList))

    # Identify each domain reported in the eMail body
    for url in urlList:
        analyzeddomain = tldcache(url).registered_domain
        if analyzeddomain:
            domainList.append(analyzeddomain)
    domainList = list(dict.fromkeys(domainList))

    if urlList:
        resultmeioc["urls"] = dict(enumerate(urlList))
        resultmeioc["domains"] = dict(enumerate(domainList))

    if attachmentsList:
        resultmeioc["attachments"] = attachmentsList

    #
    # Verify the SPF record if requested
    #
    if check_spf:
        testspf = False
        sender_address = resultmeioc["from"]
        if sender_address:
            sender_domain = sender_address.split("@")[1]
            for ip in hopListIPnoPrivate:
                try:
                    resultspf = spf.check2(ip, sender_address,
                                           sender_domain)[0]
                except Exception:
                    # A failed lookup for one relay must not stop the
                    # remaining relays from being checked.
                    continue
                if resultspf == "pass":
                    testspf = True
                    break
        resultmeioc["spf"] = testspf

    print(json.dumps(resultmeioc, indent=4))
def _get_parts_regular(self, data):
    """Parse raw message bytes and wrap each MIME part in an ``EmailPart``.

    Every part yielded by ``walk()`` on the parsed message is turned into
    ``EmailPart(filename, decoded_payload)``, in walk order.

    :param data: the raw message as bytes.
    :returns: a list with one ``EmailPart`` per walked part.
    """
    parsed = BytesParser().parsebytes(data)
    collected = []
    for section in parsed.walk():
        name = section.get_filename()
        payload = section.get_payload(decode=True)
        collected.append(EmailPart(name, payload))
    return collected
# 获取指定邮件的内容(此处传入总长度,也就是获取最后一封邮件) # 相当于发送POP 3的retr命令 # resp保存服务器的响应码 # data保存该邮件的内容 resp, data, octets = conn.retr(len(mails)) # 将data的所有数据(原本是一个字节列表)拼接在一起 msg_data = b'\r\n'.join(data) # 将字符串内容解析成邮件,此处一定要指定policy=default msg = BytesParser(policy=default).parsebytes(msg_data) #① print(type(msg)) print('发件人:' + msg['from']) print('收件人:' + msg['to']) print('主题:' + msg['subject']) print('第一个收件人名字:' + msg['to'].addresses[0].username) print('第一个发件人名字:' + msg['from'].addresses[0].username) for part in msg.walk(): counter = 1 # 如果maintype是multipart,说明是容器(用于包含正文、附件等) if part.get_content_maintype() == 'multipart': continue # 如果maintype是multipart,说明是邮件正文部分 elif part.get_content_maintype() == 'text': print(part.get_content()) # 处理附件 else: # 获取附件的文件名 filename = part.get_filename() # 如果没有文件名,程序要负责为附件生成文件名 if not filename: # 根据附件的contnet_type来推测它的后缀名 ext = mimetypes.guess_extension(part.get_content_type())
class EmailReader:
    """Creates an object for email parsing.

    Call :meth:`readEmail` first; every getter reads from the message it
    stores in ``self.msg``.
    """

    def __init__(self):
        self.emailPath = ""
        self.subjectField = ""
        self.fromField = ""
        self.toField = ""
        self.htmlBody = ""
        self.textBody = ""
        self.replyTo = ""
        self.returnPath = ""

    def readEmail(self, emailPath):
        """Reads an email for parsing.

        :param emailPath: path of the raw e-mail file; the parsed message
            is stored in ``self.msg``.
        """
        # The context manager closes the file even if parsing raises.
        with open(emailPath, "rb") as f:
            self.msg = BytesParser(policy=policy.default).parse(f)

    def getFrom(self, mode="address"):
        """Gets the from field.

        :param mode: what type of way in getting the from field
            address -> Returns only the address
            name    -> Returns only the name
            full    -> Returns both the name and address
        :returns: the requested piece of the From header.  ``full`` returns
            the header value as stored (``None`` when the header is
            missing); ``address`` and ``name`` return ``""`` when the
            piece is not available.
        :raises ValueError: if ``mode`` is not one of the three options.
        """
        if mode not in ("full", "address", "name"):
            raise ValueError(
                "Parameter is undefined!\nAvailable options are only: \"address\", \"name\", and \"full\""
            )

        fromField = self.msg["From"]
        if mode == "full":
            return fromField

        # A message without a From header has neither a name nor an address.
        if fromField is None:
            return ""

        if "<" in fromField:
            if mode == "address":
                # Text after the last "<", without the closing ">".
                return fromField.split("<")[-1][:-1]
            return fromField.split("<")[0].strip()

        # No angle brackets: the header holds a bare address and no name.
        if mode == "address" and "@" in fromField:
            return fromField.strip()
        return ""

    def getSubject(self):
        """Gets the subject field"""
        return self.msg["Subject"]

    def getReplyTo(self):
        """Gets the Reply-To field"""
        return self.msg["Reply-To"]

    def getReturnPath(self):
        """Gets the Return-Path field"""
        return self.msg["Return-Path"]

    def getHeader(self, header=""):
        """Gets any header.

        :param header: name of the header to look up.
        :returns: the header value, ``None`` when the message does not
            carry it, or ``""`` when ``header`` is empty or the lookup
            fails.
        """
        if header == "":
            return ""
        try:
            return self.msg[header]
        except Exception:
            # Best effort: an unusable header name or a message that has
            # not been loaded yields an empty string.
            return ""

    @staticmethod
    def _decodePart(part):
        """Decodes a text part using its declared charset, else latin-1."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        charset = part.get_content_charset() or "ISO-8859-1"
        try:
            return payload.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrongly declared charset: ISO-8859-1 maps every
            # byte, so this fallback cannot fail.
            return payload.decode("ISO-8859-1")

    def getBody(self, mode="all"):
        """Gets the body.

        Non-attachment ``text/html`` and ``text/plain`` parts are decoded
        and also stored in ``self.htmlBody`` / ``self.textBody``.  When a
        message carries several parts of the same type the last one wins.

        :param mode: what type of way in getting the email's body.
            all  -> Returns both html and text, as a ``(html, text)`` tuple
            html -> Returns only the html
            text -> Returns only the text
        :returns: the requested body or bodies (``""`` when absent), or
            ``None`` for any other ``mode``.
        """
        htmlBody = ""
        textBody = ""

        # walk() also yields the message itself, so single-part messages
        # are handled by the same loop as multipart ones.
        for part in self.msg.walk():
            if part.get_content_disposition() == "attachment":
                continue
            contentType = part.get_content_type()
            if contentType == "text/html":
                self.htmlBody = htmlBody = self._decodePart(part)
            elif contentType == "text/plain":
                self.textBody = textBody = self._decodePart(part)

        if mode == "all":
            return htmlBody, textBody
        if mode == "html":
            return htmlBody
        if mode == "text":
            return textBody
        return None