def _html_part_text(part):
    """Return the decoded HTML markup carried by the message part *part*."""
    try:
        return part.get_content()
    except LookupError:
        # The declared charset is unknown to Python; latin-1 maps every byte
        # to a character, so this fallback cannot fail.
        return part.get_payload(decode=True).decode('latin-1')


def email_parser(email_file):
    """Parse an ``.eml`` file and return the body found in it.

    Args:
        email_file: Path of the message file; it is opened in binary mode.

    Returns:
        The kind of object depends on the branch that produced it:

        * single-part ``text/plain``: the message part returned by
          ``get_body``;
        * HTML bodies: whatever ``html_parse`` returns for the markup;
        * a ``text/plain`` part inside a multipart message: the decoded
          payload as ``bytes``;
        * ``None`` when no supported body is found.
    """
    with open(email_file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    # Start from a defined value so a message without a usable body yields
    # None instead of failing on an unbound local.
    body = None

    if not msg.is_multipart():
        if msg.get_content_maintype() != "text":
            return body
        subtype = msg.get_content_subtype()
        if subtype == "plain":
            body = msg.get_body(preferencelist=('plain',))
        elif subtype == "html":
            html_part = msg.get_body(preferencelist=('html',))
            if html_part is not None:
                body = html_parse(_html_part_text(html_part))
        else:
            print("Don't know if html or text {}".format(subtype))
        return body

    print("Email is multipart")
    for index, part in enumerate(msg.walk(), start=1):
        print("part " + str(index))
        print(str(part.get('Content-Disposition')))
        content_type = part.get_content_type()
        print(content_type)
        print(part.get_content_subtype())

        if content_type in ('multipart/alternative', 'multipart/related'):
            html_part = part.get_body(preferencelist=('html',))
            print("----Body from get_body()-------")
            print(html_part)
            print("----Parsed text through beautiful soup-------")
            if html_part is not None:
                # Use the decoded content rather than the serialised part, so
                # headers and transfer encoding never reach the HTML parser.
                body = html_parse(_html_part_text(html_part))
            print(body)

        if content_type == 'text/plain':
            # The first plain-text part wins and ends the walk.
            body = part.get_payload(decode=True)
            print(body)
            break

    return body
def extract_text(self, current_file) -> dict:
    """Extract the normalised text of the first HTML part of an email.

    The file is parsed as an RFC 822 message. For a multipart message the
    first ``text/html`` part is reduced to its text nodes, stripped of
    punctuation, lower-cased, passed through the ``SPACES``, ``NEWLINE`` and
    ``TABS`` substitutions, and has non-ASCII characters replaced by spaces.

    Args:
        current_file: Path of the ``.eml`` file to read.

    Returns:
        ``{basename: text}`` on success. ``None`` when the message is not
        multipart, has no ``text/html`` part, or could not be processed.

    Side effects:
        On success ``self.mapping_dict`` gains the entry and
        ``self.file_counter`` is incremented. On any exception the file's
        basename is added to ``self.error_files`` (once) and
        ``self.error_file_counter`` is incremented.
    """
    file_name = os.path.basename(current_file)
    try:
        with open(current_file, 'rb') as eml_f:
            msg = BytesParser(policy=policy.default).parse(eml_f)

        if not msg.is_multipart():
            return None

        for part in msg.walk():
            if part.get_content_type() != 'text/html':
                continue

            soup = BeautifulSoup(part.get_content(), 'html.parser')
            fragments = soup.findAll(text=True)  # every text node

            # Collapse the text nodes into one normalised string.
            body = ' '.join(fragments) \
                .translate(str.maketrans('', '', string.punctuation)) \
                .lower()
            body = SPACES.sub(" ", body)
            body = NEWLINE.sub("", body)
            body = TABS.sub(" ", body)
            body = ''.join(ch if ord(ch) < 128 else ' ' for ch in body)

            self.mapping_dict[file_name] = body
            self.file_counter += 1
            return {file_name: body}

        return None
    except Exception:
        # error_files stores basenames, so the duplicate check must compare
        # the basename too; otherwise the same file is counted repeatedly.
        if file_name not in self.error_files:
            self.error_file_counter += 1
            self.error_files.append(file_name)
        return None
def split_email(self, raw_email):
    """Split a raw message into the parts to keep and its attachments.

    Args:
        raw_email: The complete message as ``bytes``.

    Returns:
        A tuple ``(to_keep, attachments, parsed_email)``:

        * ``to_keep``: for a multipart message, the top-level sub-parts that
          carry no filename; otherwise a one-element list holding the
          message payload;
        * ``attachments``: one ``File(payload_bytes, filename)`` per
          top-level sub-part that carries a filename;
        * ``parsed_email``: the parsed message object.
    """
    parsed_email = BytesParser().parsebytes(raw_email)
    to_keep = []
    attachments = []

    if not parsed_email.is_multipart():
        to_keep.append(parsed_email.get_payload())
        return to_keep, attachments, parsed_email

    for part in parsed_email.get_payload():
        raw_name = part.get_filename()
        if not raw_name:
            to_keep.append(part)
            continue

        # decode_header may return several (fragment, charset) pairs, and a
        # fragment may be bytes even when its charset is None. Decode every
        # fragment and join them so the full name is always a str.
        pieces = []
        for fragment, charset in decode_header(raw_name):
            if isinstance(fragment, bytes):
                try:
                    fragment = fragment.decode(charset or 'ascii',
                                               errors='replace')
                except LookupError:
                    # Charset label unknown to Python.
                    fragment = fragment.decode('ascii', errors='replace')
            pieces.append(fragment)

        attachments.append(
            File(part.get_payload(decode=True), ''.join(pieces)))

    return to_keep, attachments, parsed_email
def extract_text(self, current_file: str) -> dict:
    """Extract the normalised HTML text of one ``.eml`` file.

    For a multipart message the first ``text/html`` part is used; for a
    single-part message the message itself is used when it is ``text/html``.
    The HTML is reduced to its text nodes, stripped of punctuation,
    lower-cased, passed through the ``SPACES``, ``NEWLINE`` and ``TABS``
    substitutions, and has non-ASCII characters replaced by spaces.

    Args:
        current_file: Path of the ``.eml`` file to read.

    Returns:
        * ``{basename: text}`` when the text was extracted and stored;
        * a ``str`` message when the file was already in
          ``self.mapping_dict`` or (multipart only) the HTML part has no
          text nodes;
        * ``None`` when there is no ``text/html`` content or an exception
          was raised while processing.

    Side effects:
        Updates ``self.mapping_dict`` / ``self.file_counter`` on success and
        ``self.error_files`` / ``self.error_file_counter`` on failure.
        Failures are reported through ``logger.error(error=...)``
        (presumably a project-specific logger; the ``error`` keyword is kept
        as the original call sites use it).
    """
    file_name = os.path.basename(current_file)

    def record_failure():
        # Count this file as one that could not be mined.
        self.error_file_counter += 1
        self.error_files.append(file_name)

    def normalise(fragments):
        # Collapse BeautifulSoup text nodes into one cleaned ASCII string.
        text = ' '.join(fragments) \
            .translate(str.maketrans('', '', string.punctuation)) \
            .lower()
        text = SPACES.sub(" ", text)
        text = NEWLINE.sub("", text)
        text = TABS.sub(" ", text)
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

    def store(body):
        # Register the text unless this file name has been seen before.
        if file_name in self.mapping_dict:
            return f"Eml File: {file_name} has already been read in."
        self.mapping_dict[file_name] = body
        self.file_counter += 1
        return {file_name: body}

    try:
        with open(current_file, 'rb') as eml_file:
            msg = BytesParser(policy=policy.default).parse(eml_file)

        if msg.is_multipart():
            html_part = next(
                (part for part in msg.walk()
                 if part.get_content_type() == 'text/html'),
                None)
            if html_part is None:
                return None

            soup = BeautifulSoup(html_part.get_content(), 'html.parser')
            fragments = soup.findAll(text=True)
            if not fragments:
                record_failure()
                return f"No text body in email: {file_name}"

            body = normalise(fragments)
            if not body:
                # Nothing survived the clean-up: flag the file as an error.
                # NOTE(review): the empty text is still stored and counted
                # as read below - confirm this is intended.
                record_failure()
                logger.error(
                    error=f"Eml file: {file_name} has no text body.")
            return store(body)

        # Single-part message: the text can be extracted directly.
        if msg.get_content_type() == 'text/html':
            soup = BeautifulSoup(msg.get_content(), 'html.parser')
            return store(normalise(soup.findAll(text=True)))
        return None
    except Exception as e:
        # Covers I/O errors as well as parsing and extraction failures.
        record_failure()
        logger.error(
            error=f'Eml file: {file_name} could not be text mined.')
        logger.error(error=e)
        return None
class EmailReader:
    """Creates an object for email parsing.

    Call ``readEmail`` first; every getter reads from the message parsed by
    that call (stored in ``self.msg``).
    """

    def __init__(self):
        self.emailPath = ""
        self.subjectField = ""
        self.fromField = ""
        self.toField = ""
        self.htmlBody = ""
        self.textBody = ""
        self.replyTo = ""
        self.returnPath = ""

    def readEmail(self, emailPath):
        """Reads an email for parsing.

        :param emailPath: path of the message file, opened in binary mode
        """
        # The context manager closes the file even when parsing fails.
        with open(emailPath, "rb") as f:
            self.msg = BytesParser(policy=policy.default).parse(f)

    def getFrom(self, mode="address"):
        """Gets the from field.

        :param mode: what type of way in getting the from field
            address -> Returns only the address
            name -> Returns only the name
            full -> Returns both the name and address
        :return: the requested piece of the From header. "full" returns the
            header as stored (``None`` when it is absent); "address" and
            "name" return "" when the header is absent or has no
            ``<address>`` section.
        :raises ValueError: if ``mode`` is not one of the three options
        """
        fromField = self.msg["From"]
        if mode == "full":
            return fromField
        if mode not in ("address", "name"):
            raise ValueError(
                "Parameter is undefined!\nAvailable options are only: \"address\", \"name\", and \"full\""
            )
        # NOTE(review): a bare address without angle brackets yields "" in
        # both modes; confirm callers expect that.
        if fromField is None or "<" not in fromField:
            return ""
        if mode == "address":
            # Text between the last "<" and the ">" that closes it.
            return fromField.split("<")[-1].split(">")[0]
        return fromField.split("<")[0].strip()

    def getSubject(self):
        """Gets the subject field"""
        return self.msg["Subject"]

    def getReplyTo(self):
        """Gets the Reply-To field"""
        return self.msg["Reply-To"]

    def getReturnPath(self):
        """Gets the Return-Path field"""
        return self.msg["Return-Path"]

    def getHeader(self, header=""):
        """Gets any header.

        :param header: header name; "" returns ""
        :return: the header value, ``None`` when the message lacks it, or ""
            when the lookup itself fails
        """
        if header == "":
            return ""
        try:
            return self.msg[header]
        except Exception:
            return ""

    @staticmethod
    def _decodePart(part):
        """Decodes a text part using its declared charset.

        Falls back to ISO-8859-1 when no charset is declared, the charset is
        unknown, or the bytes do not match the declared charset.
        """
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        charset = part.get_content_charset() or "ISO-8859-1"
        try:
            return payload.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # ISO-8859-1 maps every byte, so this decode cannot fail.
            return payload.decode("ISO-8859-1")

    def getBody(self, mode="all"):
        """Gets the body.

        :param mode: what type of way in getting the email's body.
            all -> Returns both html and text, as a (html, text) tuple
            html -> Returns only the html
            text -> Returns only the text
        :return: the requested body or bodies; "" for a kind that is not
            present. Any other mode returns ``None``.
        """
        htmlBody = ""
        textBody = ""

        # walk() also yields the message itself, so single-part emails are
        # handled by the same loop as multipart ones.
        for part in self.msg.walk():
            if part.get_content_disposition() == "attachment":
                continue
            contentType = part.get_content_type()
            if contentType == "text/html":
                self.htmlBody = self._decodePart(part)
                htmlBody = self.htmlBody
            elif contentType == "text/plain":
                self.textBody = self._decodePart(part)
                textBody = self.textBody

        if mode == "all":
            return htmlBody, textBody
        elif mode == "html":
            return htmlBody
        elif mode == "text":
            return textBody
        return None
print("Nebyl uveden žádný soubor") argument = 1 while (argument < len(sys.argv)): skore = 1 with open(sys.argv[argument], 'rb') as fp: msg = BytesParser(policy=policy.default).parse(fp) text = "" text2 = "" try: try: text = msg.get_body( preferencelist=('plain')).get_content() # čistý text #print("metoda 1") except: if msg.is_multipart(): for payload in msg.get_payload(): #print("metoda 2a") # if payload.is_multipart(): ... text2 = payload.get_payload() else: text2 = msg.get_payload() #print("metoda 2b") text = html2text.html2text(text2) except: text = "" #print(text2) text = text.replace('\n', ' ') #print(text) odesilatel = msg['from'] # odesílatel prijemce = msg['to'] # příjemce