def parser_header_from_file(self, text_file):
    """Parse the e-mail stored in *text_file*, print its fields and send it.

    The file is parsed as a complete message (headers and body), selected
    fields are printed to stdout, and the parsed message is then submitted
    through the SMTP server configured on this instance.

    Args:
        text_file: Path of the file containing the raw e-mail.

    Reads ``self.smtp_server``, ``self.username`` and ``self.password``.
    Errors from opening the file, from accessing the first ``To``/``From``
    address, or from the SMTP session are not caught here.
    """
    with open(text_file, 'rb') as fp:
        headers = BytesParser(policy=default).parse(fp)

    # The header items can be accessed like a dictionary.
    print('To: {}'.format(headers['to']))
    print('From: {}'.format(headers['from']))
    print('Subject: {}'.format(headers['subject']))
    print('Content: \n{}'.format(headers.get_content()))

    # Individual parts of the addresses are available as well.
    print('Recipient username: {}'.format(
        headers['to'].addresses[0].username))
    print('Sender name: {}'.format(
        headers['from'].addresses[0].display_name))

    # The context manager issues QUIT and closes the socket even when
    # login() or send_message() raises, so the connection cannot leak.
    with smtplib.SMTP(self.smtp_server) as s:
        s.login(self.username, self.password)
        s.send_message(headers)
def test_consume_and_sendmail_success(amqp_publish, smtp_messages,
                                      program_out_smtp, valid_email_message):
    """Publish a valid e-mail message and verify it is acked and delivered.

    The message is published through ``amqp_publish``; the program output is
    scanned until an "Acked:" line appears, and the first message captured by
    the SMTP fixture is compared field by field with what was published.
    """
    message = valid_email_message
    amqp_publish(message)

    # any() stops consuming the output at the first ack line.
    acked = any("Acked:" in output_line for output_line in program_out_smtp)
    assert acked, "Reached end of output without acking the message"

    # Only the first captured delivery is inspected.
    for delivery in smtp_messages:
        parser = EmailParser(policy=default_policy)
        parsed = parser.parsebytes(delivery.message)

        assert delivery.sender == config.smtp_user
        assert parsed["from"] == config.smtp_user
        assert delivery.receivers == message["to"]
        assert parsed["to"] == ", ".join(message["to"])
        assert delivery.remote_host[0] == SMTP_TEST_SERVER_HOST
        assert parsed["subject"] == message["subject"]
        assert parsed.get_content().strip() == message["content"]
        break
    else:
        assert False, "No messages received"
def extract_text(self, current_file: str) -> dict:
    """Extract normalised text from the HTML content of an ``.eml`` file.

    The file is parsed as an e-mail message. For a multipart message the
    first ``text/html`` part is used; for a single-part message the message
    itself is used when its content type is ``text/html``. The HTML text
    nodes are joined, stripped of punctuation, lower-cased, passed through
    the ``SPACES``/``NEWLINE``/``TABS`` substitutions and have every
    non-ASCII character replaced by a space.

    Args:
        current_file: Path of the ``.eml`` file to read.

    Returns:
        * ``{file name: cleaned text}`` when the text was newly stored in
          ``self.mapping_dict`` (``self.file_counter`` is incremented).
        * A message string when the multipart HTML part yields no text
          nodes, or when the file name is already in ``self.mapping_dict``.
        * ``None`` when there is no ``text/html`` content, or when any
          exception occurred (the failure is logged and recorded in
          ``self.error_files`` / ``self.error_file_counter``).

    Note that, despite the annotation, the result is not always a dict.
    """
    file_name = os.path.basename(current_file)

    def clean(fragments):
        # Turn the list of HTML text nodes into one normalised ASCII string.
        text = ' '.join(fragments) \
            .translate(str.maketrans('', '', string.punctuation)) \
            .lower()
        text = SPACES.sub(" ", text)
        text = NEWLINE.sub("", text)
        text = TABS.sub(" ", text)
        return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

    def record_error():
        # Register this file in the instance's error bookkeeping.
        self.error_file_counter += 1
        self.error_files.append(file_name)

    def store(body):
        # Save the text under the file name unless that file was seen before.
        if file_name in self.mapping_dict:
            return f"Eml File: {file_name} has already been read in."
        self.mapping_dict[file_name] = body
        self.file_counter += 1
        return {file_name: body}

    try:
        # parse() consumes the whole file, so it can be closed right away.
        with open(current_file, 'rb') as eml_file:
            msg = BytesParser(policy=policy.default).parse(eml_file)

        if msg.is_multipart():
            # Only the first text/html part is considered.
            html_part = next(
                (part for part in msg.walk()
                 if part.get_content_type() == 'text/html'),
                None,
            )
            if html_part is None:
                return None

            soup = BeautifulSoup(html_part.get_content(), 'html.parser')
            fragments = soup.findAll(text=True)  # extract the text
            if not fragments:
                record_error()
                return f"No text body in email: {file_name}"

            body = clean(fragments)
            if not body:
                # No text survived the clean-up: report the file as an error.
                # NOTE(review): the empty text is still stored and counted
                # below, as in the original logic - confirm this is intended.
                record_error()
                logger.error(
                    error=f"Eml file: {file_name} has no text body.")
            return store(body)

        # Single-part message: the text can be extracted directly.
        if msg.get_content_type() != 'text/html':
            return None
        soup = BeautifulSoup(msg.get_content(), 'html.parser')
        return store(clean(soup.findAll(text=True)))
    except Exception as e:
        # Any failure (I/O, parsing, text extraction) is recorded and logged
        # instead of being propagated to the caller.
        record_error()
        logger.error(
            error=f'Eml file: {file_name} could not be text mined.')
        logger.error(error=e)
    return None