def parser_header_from_file(self, text_file):
    """Parse an e-mail from *text_file*, print its key fields, and send it.

    The file is parsed with ``BytesParser`` using the ``default`` policy.
    The To, From and Subject headers, the content, the first recipient's
    username and the first sender's display name are printed. The parsed
    message is then sent through the SMTP server named by
    ``self.smtp_server`` after logging in with ``self.username`` and
    ``self.password``.

    Args:
        text_file: Path of the file holding the raw e-mail message.

    Raises:
        OSError: If the file cannot be opened or the server cannot be reached.
        smtplib.SMTPException: If login or delivery fails.
        AttributeError, IndexError: If the To or From header is missing or
            holds no address (the address printing below does not guard
            against that).
    """
    # BytesParser.parse() needs a binary file object, hence mode 'rb'.
    with open(text_file, 'rb') as fp:
        headers = BytesParser(policy=default).parse(fp)

    # The header items can be accessed like a dictionary:
    print(f"To: {headers['to']}")
    print(f"From: {headers['from']}")
    print(f"Subject: {headers['subject']}")
    print(f"Content: \n{headers.get_content()}")

    # The parts of the addresses are available as well:
    print(f"Recipient username: {headers['to'].addresses[0].username}")
    print(f"Sender name: {headers['from'].addresses[0].display_name}")

    # The context manager issues QUIT and closes the socket even when
    # login() or send_message() raises, so the connection is never leaked.
    with smtplib.SMTP(self.smtp_server) as s:
        s.login(self.username, self.password)
        s.send_message(headers)
def test_consume_and_sendmail_success(amqp_publish, smtp_messages,
                                      program_out_smtp, valid_email_message):
    """Publish a valid message and verify the first e-mail captured via SMTP.

    The message is published with ``amqp_publish``; the program output is
    then read until a line containing ``"Acked:"`` appears. The first item of
    ``smtp_messages`` is parsed and its envelope and headers are compared
    with the published message and the configured SMTP user.

    The arguments are presumably pytest fixtures supplied by the test suite.
    """
    message = valid_email_message
    amqp_publish(message)

    # any() stops at the first matching line, so no further output is read.
    acked = any("Acked:" in line for line in program_out_smtp)
    if not acked:
        assert False, "Reached end of output without acking the message"

    # Only the first captured message is inspected.
    nothing_received = object()
    received_smtp = next(iter(smtp_messages), nothing_received)
    if received_smtp is nothing_received:
        assert False, "No messages received"
    else:
        parser = EmailParser(policy=default_policy)
        received_email = parser.parsebytes(received_smtp.message)

        # Envelope and header checks, sender first, then recipients.
        assert received_smtp.sender == config.smtp_user
        assert received_email["from"] == config.smtp_user
        assert received_smtp.receivers == message["to"]
        assert received_email["to"] == ", ".join(message["to"])
        assert received_smtp.remote_host[0] == SMTP_TEST_SERVER_HOST
        assert received_email["subject"] == message["subject"]
        assert received_email.get_content().strip() == message["content"]
Ejemplo n.º 3
0
    def extract_text(self, current_file: str) -> dict:
        """Extract the cleaned text of the first ``text/html`` body of an .eml file.

        The file is parsed with ``BytesParser``. For a multipart message the
        first ``text/html`` part yielded by ``walk()`` is used; for a
        single-part message the message itself is used when its content type
        is ``text/html``. The HTML is reduced to its text nodes, cleaned by
        ``_normalize_text`` and stored in ``self.mapping_dict`` under the
        file's base name.

        Args:
            current_file: Path of the .eml file to read.

        Returns:
            ``{base_name: text}`` when the file was newly added to
            ``self.mapping_dict``; a ``str`` message when the file had
            already been read or when a multipart HTML part contained no
            text nodes; ``None`` when the message has no ``text/html``
            content or when an error occurred.
            NOTE(review): the ``-> dict`` annotation is kept unchanged for
            interface compatibility but does not cover the ``str`` and
            ``None`` results.

        Any exception raised while reading or processing the file is caught,
        counted in ``self.error_file_counter``, recorded in
        ``self.error_files`` and logged; it is not re-raised.
        """
        file_name = os.path.basename(current_file)
        try:
            # The parser reads the whole file, so it can be closed right away.
            with open(current_file, 'rb') as eml_file:
                msg = BytesParser(policy=policy.default).parse(eml_file)

            if msg.is_multipart():
                # Only the first text/html part is ever processed.
                html_part = next(
                    (part for part in msg.walk()
                     if part.get_content_type() == 'text/html'),
                    None)
                if html_part is None:
                    return None

                soup = BeautifulSoup(html_part.get_content(), 'html.parser')
                fragments = soup.findAll(text=True)  # extract the text
                if not fragments:
                    self._record_error_file(file_name)
                    return f"No text body in email: {file_name}"

                body = self._normalize_text(fragments)
                if not body:
                    # No text survived the clean-up: register the file as an
                    # error. NOTE(review): the empty body is still stored
                    # below, exactly as before -- confirm this is intended.
                    self._record_error_file(file_name)
                    logger.error(
                        error=f"Eml file: {file_name} has no text body.")
                return self._store_body(file_name, body)

            # Single-part message: the text can be extracted directly.
            if msg.get_content_type() != 'text/html':
                return None
            soup = BeautifulSoup(msg.get_content(), 'html.parser')
            fragments = soup.findAll(text=True)  # extract the text
            return self._store_body(file_name,
                                    self._normalize_text(fragments))
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the
            # caller; the failure is recorded and logged instead.
            self._record_error_file(file_name)
            logger.error(
                error=f'Eml file: {file_name} could not be text mined.')
            logger.error(error=e)
            return None

    @staticmethod
    def _normalize_text(fragments) -> str:
        """Join text fragments into one lower-case, ASCII-only string."""
        # Deleting every character of string.punctuation in a single pass.
        text = ' '.join(fragments) \
            .translate(str.maketrans('', '', string.punctuation)) \
            .lower()
        # SPACES, NEWLINE and TABS are module-level patterns defined outside
        # this block; the substitution order is kept from the original code.
        text = SPACES.sub(" ", text)
        text = NEWLINE.sub("", text)
        text = TABS.sub(" ", text)
        # Replace every non-ASCII character with a space.
        return ''.join([ch if ord(ch) < 128 else ' ' for ch in text])

    def _record_error_file(self, file_name: str) -> None:
        """Count *file_name* as a failed file and remember its name."""
        self.error_file_counter += 1
        self.error_files.append(file_name)

    def _store_body(self, file_name: str, body: str):
        """Store *body* under *file_name* unless that file is already mapped."""
        if file_name in self.mapping_dict:
            return f"Eml File: {file_name} has already been read in."
        self.mapping_dict[file_name] = body
        self.file_counter += 1
        return {file_name: body}