コード例 #1
0
    def extract_text(self, current_file: str) -> dict:
        try:
            with open(current_file, 'rb') as eml_file:
                #logger.info(info=f'Eml file: {os.path.basename(current_file)}')
                msg = BytesParser(policy=policy.default).parse(eml_file)
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == 'text/html':
                            soup = BeautifulSoup(part.get_content(),
                                                 'html.parser')
                            body = soup.findAll(text=True)  # extract the text

                            # check if the body of the eml file is None or 0
                            if not body:
                                self.error_file_counter += 1
                                self.error_files.append(
                                    os.path.basename(current_file))
                                return f"No text body in email: {os.path.basename(current_file)}"
                            else:
                                # process the text list into a formatted string
                                body = ' '.join(body) \
                                    .translate(str.maketrans('', '', string.punctuation)) \
                                    .lower()
                                body = SPACES.sub(" ", body)
                                body = NEWLINE.sub("", body)
                                body = TABS.sub(" ", body)
                                body = ''.join(
                                    [i if ord(i) < 128 else ' ' for i in body])
                                #print(f"body := {body}")
                                # UPDATE: added 6/20/2019
                                if len(body) == 0:
                                    # not text was extracted from this file; add to error files list
                                    self.error_file_counter += 1
                                    self.error_files.append(
                                        os.path.basename(current_file))
                                    logger.error(
                                        error=
                                        f"Eml file: {os.path.basename(current_file)} has no text body."
                                    )

                            # update the mapping dict if the file is not currently in the mapping dictionary
                            if os.path.basename(
                                    current_file
                            ) not in self.mapping_dict.keys():
                                self.mapping_dict[os.path.basename(
                                    current_file)] = body
                                self.file_counter += 1
                                return {os.path.basename(current_file): body}
                            else:
                                return f"Eml File: {os.path.basename(current_file)} has already been read in."
                else:
                    # UPDATE: added 6/20/2019
                    # if email is not multipart, we can extract the text directly
                    try:
                        if msg.get_content_type() == 'text/html':
                            soup = BeautifulSoup(msg.get_content(),
                                                 'html.parser')
                            body = soup.findAll(text=True)  # extract the text
                            # process the text list into a formatted string
                            body = ' '.join(body) \
                                .translate(str.maketrans('', '', string.punctuation)) \
                                .lower()
                            body = SPACES.sub(" ", body)
                            body = NEWLINE.sub("", body)
                            body = TABS.sub(" ", body)
                            body = ''.join(
                                [i if ord(i) < 128 else ' ' for i in body])
                            #print(f"body := {body}")
                            # update the mapping dict if the file is not currently in the mapping dictionary
                            if os.path.basename(
                                    current_file
                            ) not in self.mapping_dict.keys():
                                self.mapping_dict[os.path.basename(
                                    current_file)] = body
                                self.file_counter += 1
                                return {os.path.basename(current_file): body}
                            else:
                                return f"Eml File: {os.path.basename(current_file)} has already been read in."
                    except Exception as e:
                        # NOTE: *added 06/28/2019*
                        self.error_file_counter += 1
                        self.error_files.append(os.path.basename(current_file))
                        logger.error(
                            error=
                            f'Eml file: {os.path.basename(current_file)} could not be text mined.'
                        )
                        logger.error(error=e)
        except (OSError, Exception) as e:
            # update the error file information
            self.error_file_counter += 1
            self.error_files.append(os.path.basename(current_file))
            logger.error(
                error=
                f'Eml file: {os.path.basename(current_file)} could not be text mined.'
            )
            logger.error(error=e)