def extract_text(self, current_file: str) -> "dict | str | None":
    """Extract the cleaned text of the first ``text/html`` part of an .eml file.

    The file is parsed with ``BytesParser`` and the first part reported by
    ``msg.walk()`` whose content type is ``text/html`` is used (for a
    non-multipart message ``walk()`` yields only the message itself, so both
    kinds of email go through the same path). The HTML text nodes are joined
    with spaces, stripped of ``string.punctuation``, lower-cased, run through
    the module-level ``SPACES``, ``NEWLINE`` and ``TABS`` substitutions, and
    every non-ASCII character is replaced by a space.

    Side effects:
        * On success, ``self.mapping_dict[<basename>]`` is set to the cleaned
          body and ``self.file_counter`` is incremented.
        * On failure (exception, or no text left after cleaning),
          ``self.error_file_counter`` is incremented, the basename is
          appended to ``self.error_files`` and the problem is logged.

    Args:
        current_file: Path of the .eml file to read.

    Returns:
        * ``{basename: body}`` when the file was read and newly registered.
        * A message string when the email has no text body, or when the
          basename is already a key of ``self.mapping_dict``.
        * ``None`` when the email has no ``text/html`` part, or when an
          exception was caught while reading/parsing the file.
    """
    file_name = os.path.basename(current_file)
    try:
        # BytesParser.parse() consumes the whole stream, so the parsed
        # message remains usable after the file has been closed.
        with open(current_file, 'rb') as eml_file:
            msg = BytesParser(policy=policy.default).parse(eml_file)

        html_part = next(
            (part for part in msg.walk()
             if part.get_content_type() == 'text/html'),
            None,
        )
        if html_part is None:
            # Nothing to mine; as before, this is not recorded as an error.
            return None

        soup = BeautifulSoup(html_part.get_content(), 'html.parser')
        text_nodes = soup.findAll(text=True)  # every text node in the HTML

        # Collapse the text nodes into one normalised string.
        body = ' '.join(text_nodes) \
            .translate(str.maketrans('', '', string.punctuation)) \
            .lower()
        body = SPACES.sub(" ", body)
        body = NEWLINE.sub("", body)
        body = TABS.sub(" ", body)
        body = ''.join(ch if ord(ch) < 128 else ' ' for ch in body)

        if not body:
            # An empty body is an error, not a success: return before the
            # mapping is touched so the file is never counted in both
            # error_file_counter and file_counter.
            self.error_file_counter += 1
            self.error_files.append(file_name)
            logger.error(error=f"Eml file: {file_name} has no text body.")
            return f"No text body in email: {file_name}"

        if file_name in self.mapping_dict:
            return f"Eml File: {file_name} has already been read in."

        self.mapping_dict[file_name] = body
        self.file_counter += 1
        return {file_name: body}
    except Exception as e:
        # Best-effort boundary: any failure (I/O, parsing, decoding) marks
        # the file as an error and lets the caller carry on with the rest.
        self.error_file_counter += 1
        self.error_files.append(file_name)
        logger.error(
            error=f'Eml file: {file_name} could not be text mined.')
        logger.error(error=e)
        return None