コード例 #1
0
def email_parser(email_file):
    """Parse an ``.eml`` file and return the body that was found in it.

    The file is read in binary mode and parsed with
    ``BytesParser(policy=policy.default)``.

    * Non-multipart ``text/plain``: returns the part selected by
      ``get_body`` (a message object, not a string).
    * Non-multipart ``text/html``: returns whatever ``html_parse`` produces
      for the serialised HTML part.
    * Multipart: every part is walked and diagnostics are printed. A
      ``multipart/alternative`` or ``multipart/related`` container is run
      through ``html_parse``; the first ``text/plain`` part replaces any
      earlier result with its decoded payload (``bytes``) and ends the walk.

    :param email_file: path of the email file to parse.
    :return: the body as described above, or ``None`` when no usable body
        exists (this case used to raise ``UnboundLocalError``).
    """
    with open(email_file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    # Explicit default so every path below has something to return.
    body = None

    def serialised_markup(html_part):
        # NOTE(review): drops the first three lines of the serialised part,
        # presumably its headers and the blank separator -- fragile, confirm.
        return '\n'.join(str(html_part).split("\n")[3:])

    if not msg.is_multipart():
        if msg.get_content_maintype() != "text":
            return body
        subtype = msg.get_content_subtype()
        if subtype == "plain":
            # get_body() expects a sequence of subtype names, not a string.
            body = msg.get_body(preferencelist=('plain',))
        elif subtype == "html":
            html_part = msg.get_body(preferencelist=('html',))
            body = html_parse(serialised_markup(html_part))
        else:
            print("Don't know if html or text {}".format(subtype))
        return body

    print("Email is multipart")
    for index, part in enumerate(msg.walk(), start=1):
        print("part " + str(index))
        print(str(part.get('Content-Disposition')))
        content_type = part.get_content_type()
        print(content_type)
        print(part.get_content_subtype())
        if content_type in ('multipart/alternative', 'multipart/related'):
            html_part = part.get_body(preferencelist=('html',))
            print("----Body from get_body()-------")
            print(html_part)
            markup = serialised_markup(html_part)
            print("----Parsed text through beautiful soup-------")
            body = html_parse(markup)
            print(body)
        if content_type == 'text/plain':
            # A plain-text part wins over any HTML rendering found so far.
            body = part.get_payload(decode=True)  # decode
            print(body)
            break
    return body
    def extract_text(self, current_file) -> dict:
        """Extract the text of the first ``text/html`` part of a multipart email.

        The HTML is stripped with BeautifulSoup, punctuation is removed, the
        text is lower-cased, the ``SPACES``/``NEWLINE``/``TABS`` patterns are
        applied and every non-ASCII character is replaced by a space.

        On success the text is stored in ``self.mapping_dict`` under the
        file's base name and ``self.file_counter`` is incremented.

        :param current_file: path of the ``.eml`` file to read.
        :return: ``{basename: text}`` on success; ``None`` when the message is
            not multipart, has no ``text/html`` part, or could not be
            processed.
        """
        file_name = os.path.basename(current_file)
        try:
            with open(current_file, 'rb') as eml_f:
                msg = BytesParser(policy=policy.default).parse(eml_f)

            if not msg.is_multipart():
                return None

            for part in msg.walk():
                if part.get_content_type() != 'text/html':
                    continue

                soup = BeautifulSoup(part.get_content(), 'html.parser')
                text_nodes = soup.findAll(text=True)  # extract the text

                # process the text list into a formatted string
                body = ' '.join(text_nodes) \
                    .translate(str.maketrans('', '', string.punctuation)) \
                    .lower()
                body = SPACES.sub(" ", body)
                body = NEWLINE.sub("", body)
                body = TABS.sub(" ", body)
                body = ''.join(
                    [ch if ord(ch) < 128 else ' ' for ch in body])

                # NOTE: update for dms_claims project (5/17/19) -- this
                # branch currently adds nothing to the mapping.
                if self.project == 'dms_claims':
                    self.mapping_dict.update({})

                self.mapping_dict.update({file_name: body})
                self.file_counter += 1
                return {file_name: body}
        except Exception:
            # OSError is a subclass of Exception, so one handler covers both.
            # error_files stores base names, so compare the base name; the
            # old full-path check never matched and duplicated entries.
            if file_name not in self.error_files:
                self.error_file_counter += 1
                self.error_files.append(file_name)
        return None
コード例 #3
0
ファイル: mail.py プロジェクト: xschul/PyCIRCLeanMail
 def split_email(self, raw_email):
     """Split a raw email into the parts to keep and its attachments.

     A top-level part counts as an attachment when it carries a filename.
     The filename is decoded with ``decode_header`` and all decoded chunks
     are joined, so names mixing encoded and plain segments are neither
     truncated nor left as raw bytes.

     :param raw_email: the complete message as bytes.
     :return: ``(to_keep, attachments, parsed_email)``. ``to_keep`` holds
         the parts without a filename, or the payload itself for a
         non-multipart message; ``attachments`` holds ``File`` objects.
     """
     parsed_email = BytesParser().parsebytes(raw_email)
     to_keep = []
     attachments = []

     if not parsed_email.is_multipart():
         to_keep.append(parsed_email.get_payload())
         return to_keep, attachments, parsed_email

     for part in parsed_email.get_payload():
         raw_name = part.get_filename()
         if not raw_name:
             to_keep.append(part)
             continue

         chunks = []
         for data, charset in decode_header(raw_name):
             if isinstance(data, bytes):
                 try:
                     data = data.decode(charset or 'ascii', 'replace')
                 except LookupError:
                     # Unknown charset label: keep a best-effort name.
                     data = data.decode('ascii', 'replace')
             chunks.append(data)

         attachments.append(
             File(part.get_payload(decode=True), ''.join(chunks)))
     return to_keep, attachments, parsed_email
コード例 #4
0
ファイル: mail.py プロジェクト: CIRCL/PyCIRCLeanMail
 def split_email(self, raw_email):
     """Separate the attachments of a raw email from the rest of it.

     Only the top-level parts are inspected. A part with a filename becomes
     a ``File`` built from its decoded payload and its decoded filename;
     every other part is kept as is. The filename is assembled from every
     chunk returned by ``decode_header`` rather than just the first one.

     :param raw_email: the complete message as bytes.
     :return: ``(to_keep, attachments, parsed_email)``; for a non-multipart
         message ``to_keep`` contains the message payload.
     """

     def decoded_name(header_value):
         # Join every (data, charset) chunk into one text filename.
         pieces = []
         for data, charset in decode_header(header_value):
             if isinstance(data, bytes):
                 try:
                     data = data.decode(charset or 'ascii', 'replace')
                 except LookupError:
                     # Unknown charset label: fall back to ASCII.
                     data = data.decode('ascii', 'replace')
             pieces.append(data)
         return ''.join(pieces)

     parsed_email = BytesParser().parsebytes(raw_email)
     to_keep = []
     attachments = []
     if parsed_email.is_multipart():
         for part in parsed_email.get_payload():
             name = part.get_filename()
             if name:
                 attachments.append(
                     File(part.get_payload(decode=True), decoded_name(name)))
             else:
                 to_keep.append(part)
     else:
         to_keep.append(parsed_email.get_payload())
     return to_keep, attachments, parsed_email
コード例 #5
0
    def extract_text(self, current_file: str) -> dict:
        """Extract and normalise the HTML text of an ``.eml`` file.

        For a multipart message the first ``text/html`` part is used; for a
        non-multipart message the message itself is used when it is
        ``text/html``. The HTML is stripped with BeautifulSoup, punctuation
        is removed, the text is lower-cased, the ``SPACES``/``NEWLINE``/
        ``TABS`` patterns are applied and non-ASCII characters are replaced
        by spaces.

        :param current_file: path of the ``.eml`` file to read.
        :return: ``{basename: text}`` when the file is newly registered in
            ``self.mapping_dict``; a message string when the file was
            already registered or (multipart only) the HTML contains no
            text nodes; ``None`` when there is no HTML content or an error
            occurred. Errors are recorded in ``self.error_files`` and
            ``self.error_file_counter`` and logged.
        """
        file_name = os.path.basename(current_file)

        def normalise(text_nodes):
            # Turn the list of text nodes into one formatted ASCII string.
            text = ' '.join(text_nodes) \
                .translate(str.maketrans('', '', string.punctuation)) \
                .lower()
            text = SPACES.sub(" ", text)
            text = NEWLINE.sub("", text)
            text = TABS.sub(" ", text)
            return ''.join([ch if ord(ch) < 128 else ' ' for ch in text])

        def flag_error():
            # Record this file as one that could not be mined.
            self.error_file_counter += 1
            self.error_files.append(file_name)

        def register(text):
            # Store the text unless this file name was already read in.
            if file_name in self.mapping_dict:
                return f"Eml File: {file_name} has already been read in."
            self.mapping_dict[file_name] = text
            self.file_counter += 1
            return {file_name: text}

        try:
            with open(current_file, 'rb') as eml_file:
                msg = BytesParser(policy=policy.default).parse(eml_file)

            if not msg.is_multipart():
                # A non-multipart email can be mined directly.
                if msg.get_content_type() != 'text/html':
                    return None
                soup = BeautifulSoup(msg.get_content(), 'html.parser')
                return register(normalise(soup.findAll(text=True)))

            for part in msg.walk():
                if part.get_content_type() != 'text/html':
                    continue
                soup = BeautifulSoup(part.get_content(), 'html.parser')
                text_nodes = soup.findAll(text=True)  # extract the text
                if not text_nodes:
                    flag_error()
                    return f"No text body in email: {file_name}"

                body = normalise(text_nodes)
                if not body:
                    # No text survived normalisation: flag the file.
                    # NOTE(review): the file is still registered below and
                    # counted in file_counter -- confirm this is intended.
                    flag_error()
                    logger.error(
                        error=f"Eml file: {file_name} has no text body.")
                return register(body)
            return None
        except Exception as e:
            # Exception already covers OSError; a single handler now serves
            # both the multipart and the non-multipart path.
            flag_error()
            logger.error(
                error=f'Eml file: {file_name} could not be text mined.')
            logger.error(error=e)
コード例 #6
0
class EmailReader:
    """Reads an email file and exposes its headers and bodies.

    Call :meth:`readEmail` first; the other methods read the parsed
    message stored in ``self.msg``.
    """

    def __init__(self):
        self.emailPath = ""
        self.subjectField = ""
        self.fromField = ""
        self.toField = ""
        self.htmlBody = ""
        self.textBody = ""
        self.replyTo = ""
        self.returnPath = ""

    def readEmail(self, emailPath):
        """Parse the email at ``emailPath`` and keep it in ``self.msg``."""
        # The context manager closes the file even when parsing fails.
        with open(emailPath, "rb") as f:
            self.msg = BytesParser(policy=policy.default).parse(f)
        self.emailPath = emailPath

    def getFrom(self, mode="address"):
        """Gets the from field.

        :param mode: what to return from the From header
            address -> only the address
            name -> only the display name ("" when the header has no
            ``<...>`` part)
            full -> the header value as stored (``None`` if missing)
        :raises Exception: when ``mode`` is not one of the options above.
        """
        # Local import keeps this class self-contained.
        from email.utils import parseaddr

        fromField = self.msg["From"]
        if mode == "full":
            return fromField
        if mode == "address":
            if fromField is None:
                return ""
            # parseaddr also handles a bare "user@example.com" header,
            # which the previous "<"-splitting reported as "".
            return parseaddr(str(fromField))[1]
        if mode == "name":
            if fromField is None or "<" not in fromField:
                return ""
            return fromField.split("<")[0].strip()
        raise Exception(
            "Parameter is undefined!\nAvailable options are only: \"address\", \"name\", and \"full\""
        )

    def getSubject(self):
        """Gets the subject field"""
        return self.msg["Subject"]

    def getReplyTo(self):
        """Gets the Reply-To field"""
        return self.msg["Reply-To"]

    def getReturnPath(self):
        """Gets the Return-Path field"""
        return self.msg["Return-Path"]

    def getHeader(self, header=""):
        """Gets any header.

        Returns "" for an empty name or when the lookup fails, otherwise
        whatever the message returns for that header (``None`` if absent).
        """
        if header == "":
            return ""
        try:
            return self.msg[header]
        except Exception:
            # Best effort, but do not swallow KeyboardInterrupt/SystemExit.
            return ""

    @staticmethod
    def _decodePart(part):
        """Return the decoded payload of ``part`` as text."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        # Honour the declared charset; ISO-8859-1 remains the fallback.
        charset = part.get_content_charset() or "ISO-8859-1"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # Unknown charset label.
            return payload.decode("ISO-8859-1")

    def getBody(self, mode="all"):
        """Gets the body.

        Every part of the message is inspected, including the message
        itself when it is not multipart. Attachments are skipped and the
        last matching part of each kind wins. The results are also stored
        in ``self.htmlBody`` / ``self.textBody``.

        :param mode: what to return
            all -> tuple ``(html, text)``
            html -> only the html
            text -> only the text
        :return: the requested body or bodies ("" when absent); ``None``
            for an unknown ``mode``.
        """
        htmlBody = ""
        textBody = ""
        # walk() also yields a non-multipart message itself, so simple
        # emails no longer come back empty.
        for part in self.msg.walk():
            if part.get_content_disposition() == "attachment":
                continue
            contentType = part.get_content_type()
            if contentType == "text/html":
                self.htmlBody = self._decodePart(part)
                htmlBody = self.htmlBody
            elif contentType == "text/plain":
                self.textBody = self._decodePart(part)
                textBody = self.textBody

        if mode == "all":
            return htmlBody, textBody
        if mode == "html":
            return htmlBody
        if mode == "text":
            return textBody
        return None
コード例 #7
0
    print("Nebyl uveden žádný soubor")

# Process every file named on the command line; index 0 (the script
# itself) is skipped by starting at 1.
argument = 1
while (argument < len(sys.argv)):
    # NOTE(review): "skore" (score) is only initialised in the visible
    # part of the loop -- its use is outside this view.
    skore = 1
    with open(sys.argv[argument], 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
    text = ""
    text2 = ""
    try:
        try:
            # First attempt: take the plain-text body directly.
            # Note that ('plain') is a plain string, not a one-element tuple.
            text = msg.get_body(
                preferencelist=('plain')).get_content()  # plain text
            #print("method 1")
        except:
            # Fallback: take the raw payload and convert it with html2text.
            if msg.is_multipart():
                # Each iteration overwrites text2, so the last part wins.
                for payload in msg.get_payload():
                    #print("method 2a")
                    # if payload.is_multipart(): ...
                    text2 = payload.get_payload()
            else:
                text2 = msg.get_payload()
                #print("method 2b")
            text = html2text.html2text(text2)
    except:
        # Any failure in either attempt leaves the text empty.
        text = ""
    #print(text2)
    # Flatten the text onto a single line.
    text = text.replace('\n', ' ')
    #print(text)
    odesilatel = msg['from']  # sender
    prijemce = msg['to']  # recipient