def do_walk(self):
    """
    Main entry point: index the email files under the account directory,
    then hand off to ``process_folders``.

    When ``self.from_tomes`` is set, the account directory is re-rooted at
    ``data_dir/account_name`` and its ``folder_map.tsv`` is loaded; if that
    file is missing the TOMES flag is switched off via ``CommonMethods``.

    :return: None
    """
    if self.from_tomes:
        self.account_directory = os.path.join(self.data_dir,
                                              self.account_name)
        map_file = os.path.join(self.account_directory, "folder_map.tsv")
        if os.path.exists(map_file):
            self._build_folder_map(map_file)
        else:
            # No folder map on disk: stop treating this as a TOMES structure.
            CommonMethods.set_from_tomes(False)

    print("Scanning data structure for emails.")
    pack = self.message_pack
    for folder, _subdirs, filenames in os.walk(self.account_directory):
        if not filenames:
            # Directories without any file are never registered.
            continue
        if folder not in pack:
            pack[folder] = []
        # Only names ending in "eml" are queued; other files merely cause
        # the folder to be registered (possibly with an empty list).
        pack[folder].extend(
            name for name in filenames if name.endswith("eml"))
    self.process_folders()
def _write_file(self):
    """
    Append the root element attributes to ``self.current_eaxs_file`` and
    register that file with ``CommonMethods``.

    The file is opened through a context manager so the handle is released
    even when the write itself fails.  A missing path is logged rather than
    raised.

    :return: None
    """
    try:
        with codecs.open(self.current_eaxs_file, "ab", "utf-8") as fh:
            fh.write(self.get_root_element_attributes())
        CommonMethods.set_eaxs_file(self.current_eaxs_file)
    except FileNotFoundError as e:
        self.logger.error("{}: {}".format(e, self.current_eaxs_file))
def write_global_id(self):
    """
    Append the value of ``self.get_id()`` to the current EAXS file.

    The file name is looked up once and reused for the write, the
    re-registration with ``CommonMethods`` and the error message.  The
    handle is managed by a ``with`` block so it is closed even if the write
    raises.  A missing path is logged rather than raised.

    :return: None
    """
    eaxs_filename = CommonMethods.get_eaxs_filename()
    try:
        with codecs.open(eaxs_filename, "ab", "utf-8") as fh:
            fh.write(self.get_id())
        CommonMethods.set_eaxs_file(eaxs_filename)
    except FileNotFoundError as e:
        self.logger.error("{}: {}".format(e, eaxs_filename))
def _process_message(self, mes):
    """
    Wrap ``mes`` in a ``DmMessage`` and queue it on ``self.messages``.

    :param mes: the parsed message handed to ``DmMessage``
    :return: None
    """
    # In TOMES mode the relative path is expanded through the folder map.
    if CommonMethods.get_tomes_tool():
        rel_path = self.expand_path_from_map(self.current_relpath)
    else:
        rel_path = self.current_relpath

    wrapped = DmMessage(rel_path, CommonMethods.increment_local_id(), mes,
                        self.cur_fn)
    # The wrapper's reference to the source message is cleared before the
    # wrapper is stored.
    wrapped.message = None
    self.messages.append(wrapped)
def __init__(self):
    """Constructor for ExtBodyContent"""
    # Attachment locations, both derived from the CommonMethods setting.
    self.attachment_folder = CommonMethods.get_attachment_directory()
    self.attachment_directory = os.path.join(
        CommonMethods.get_attachment_directory(), self.attachment_folder)

    # Identity of this external body part.
    self.gid = uuid.uuid4()  # type: uuid
    self.local_id = None  # type: int
    self.rel_path = None  # type: str

    # Encoding metadata; populated by the caller.
    self.char_set = None  # type: str
    self.transfer_encoding = None  # type: str
    self.eol = None  # type: Eol
    self.hash = None  # type: Hash

    # Payload and how it is to be written out.
    self.body_content = None  # type: str
    self.xml_wrapped = True  # type: bool

    self.logger = logging.getLogger("ExtBodyContent")
def _simple_ext_body(self):
    """
    Build an ``ExtBodyContent`` for the current body using only its hash
    and MIME metadata, write it out, and release the in-memory content.

    The child map is built from an explicit list of pairs: passing a dict
    literal to ``OrderedDict`` only keeps the written order on interpreters
    where plain dicts are ordered, and the child order is what gets
    rendered.

    :return: None
    """
    extbody = ExtBodyContent()
    extbody.local_id = CommonMethods.increment_local_id()
    extbody.transfer_encoding = self.transfer_encoding
    # body_content is encoded as UTF-8 text before hashing.
    extbody.hash = CommonMethods.get_hash(
        bytes(self.body_content, encoding='utf-8'))

    children = OrderedDict([
        ("ContentType", self.content_type),
        ("Disposition", self.disposition),
        ("DispositionFileName", self.disposition_file_name),
        ("ContentTransferEncoding", self.transfer_encoding),
    ])
    extbody.build_xml_file(children)
    self.ext_body_content.append(extbody)

    # Drop the references held on this object once the part is recorded.
    self.payload = None
    self.body_content = None
def process_folders(self):
    """
    Process every queued folder in ``self.message_pack``, render each one,
    then close the account (and stitch it when stitching is enabled).

    :return: None
    """
    for folder_path, filenames in self.message_pack.items():
        self.current_relpath = self.get_rel_path(folder_path)
        for filename in filenames:
            chunk_limit_reached = (
                CommonMethods.get_chunksize() != 0
                and CommonMethods.get_chunksize() == self.chunks)
            if chunk_limit_reached:
                # A chunk size of 0 disables chunking; otherwise flush the
                # folder and restart the counter once the limit is hit.
                self._fldr_render_reopen(folder_path)
                self.chunks = 0
            self.cur_fn = filename
            self.message_generator(os.path.join(folder_path, filename))
        self._fldr_render(folder_path)

    self.account.close_account()
    if CommonMethods.get_stitch():
        self.account.stitch_account()
def render(self, parent):
    """
    Append a ``MultiBody`` element to ``parent`` and fill it from the
    attributes named in the multibody map.

    :type parent: xml.etree.ElementTree.Element
    :param parent: element that receives the new ``MultiBody`` child
    :return: None
    """
    multi_el = etree.SubElement(parent, "MultiBody")
    for attr_name, tag in CommonMethods.get_multibody_map().items():
        attr_value = self.__getattribute__(attr_name)

        if attr_value is None:
            if attr_name == 'charset' or attr_name == 'boundary_string':
                # These two elements are emitted even when unset (the
                # original comment attributes this to the schema).
                placeholder = etree.SubElement(multi_el, tag)
                placeholder.text = attr_value
            continue

        if isinstance(attr_value, list):
            # Only nested body objects are rendered; any other list
            # entries are skipped.
            for part in attr_value:
                if isinstance(part, SingleBody):
                    part.render(multi_el)
                if isinstance(part, MultiBody):
                    part.render(multi_el)
            continue

        text_el = etree.SubElement(multi_el, tag)
        text_el.text = attr_value
def render(self, parent=None):
    """
    Append a ``Message`` element to ``parent`` and fill it from the
    attributes named in the message-type map.  Does nothing when no parent
    is supplied.

    Note that ``self.local_id`` is converted to ``str`` as a side effect.

    :type parent: Element
    :param parent: element that receives the new ``Message`` child
    :return: None
    """
    if parent is None:
        return

    self.local_id = str(self.local_id)
    message_el = etree.SubElement(parent, "Message")
    for attr_name, tag in CommonMethods.get_messagetype_map().items():
        attr_value = self.__getattribute__(attr_name)
        if attr_value is None:
            continue

        if isinstance(attr_value, list):
            # Only Header and MultiBody entries are rendered from lists.
            for entry in attr_value:
                if isinstance(entry, Header):
                    entry.render(message_el)
                if isinstance(entry, MultiBody):
                    entry.render(message_el)
            continue

        if isinstance(attr_value, Hash) or isinstance(attr_value, MultiBody):
            # These objects know how to render themselves.
            attr_value.render(message_el)
            continue

        text_el = etree.SubElement(message_el, tag)
        text_el.text = attr_value
def render(self, parent):
    """
    Append a ``SingleBody`` element to ``parent`` and fill it from the
    attributes named in the singlebody map.

    :type parent: xml.etree.ElementTree.Element
    :param parent: element that receives the new ``SingleBody`` child
    :return: None
    """
    single_el = etree.SubElement(parent, "SingleBody")
    for attr_name, tag in CommonMethods.get_singlebody_map().items():
        attr_value = self.__getattribute__(attr_name)
        if attr_value is None:
            continue

        if isinstance(attr_value, list):
            if not attr_value:
                continue
            # The type of the first entry decides which collection is
            # rendered; lists of any other type are skipped.
            first = attr_value[0]
            if isinstance(first, ExtBodyContent):
                for ext_part in self.ext_body_content:
                    ext_part.render(single_el)
            elif isinstance(first, IntBodyContent):
                for int_part in self.body_content:
                    int_part.render(single_el)
            continue

        text_el = etree.SubElement(single_el, tag)
        try:
            text_el.text = attr_value
        except TypeError:
            # Values the element cannot accept as text leave it empty.
            pass
def _set_vars(self):
    """
    Reset the shared ``CommonMethods`` state and derive the working
    directories from ``self.base_dir``.

    :return: None
    """
    CommonMethods.set_store_rtf_body(False)
    CommonMethods.init_hash_dict()
    CommonMethods.set_dedupe()
    CommonMethods.set_base_path(self.base_dir)

    # attribute name -> sub-directory of base_dir ('psts' maps to 'pst').
    layout = (
        ('eaxs', 'eaxs'),
        ('mboxes', 'mboxes'),
        ('emls', 'emls'),
        ('psts', 'pst'),
    )
    for attr_name, subdir in layout:
        setattr(self, attr_name, os.path.join(self.base_dir, subdir))
def process_headers(self):
    """
    Populate this body's MIME attributes from the headers of
    ``self.payload``.

    A plain ``str`` payload has no headers: it is stored as the body
    content, ``body_only`` is set, and the method returns.  Otherwise each
    ``(header, value)`` pair from ``payload.items()`` is dispatched by
    header name; anything unrecognised is appended to
    ``self.other_mime_header``.

    A ``Content-Disposition`` value without a parsable filename parameter
    is recorded in ``other_mime_header`` exactly once.

    :return: None
    """
    if isinstance(self.payload, str):
        self.body_content = self.payload
        self.body_only = True
        return

    for header, value in self.payload.items():
        if header == "Content-Type":
            # presumably (type, param-name, param-value) - confirm against
            # CommonMethods.get_content_type
            expression = CommonMethods.get_content_type(value)
            self.content_type = expression[0]
            if len(expression) > 1:
                # A charset parameter is stored on its own; any other
                # parameter is kept as a generic Parameter.
                if expression[1] == 'charset':
                    self.charset = expression[2]
                else:
                    self.content_type_param.append(
                        Parameter(expression[1], expression[2]))
            continue

        if header == "Content-Transfer-Encoding":
            self.transfer_encoding = value
            continue

        if header == "Content-Disposition":
            try:
                parts = value.split(";")
                self.disposition = parts[0]
                fn = parts[1].split("=")[1]
                # A value containing '' is treated as charset''name and
                # only the part after the first '' is kept.
                encoded = fn.split("''")
                if len(encoded) > 1:
                    self.disposition_file_name = unquote(encoded[1])
                else:
                    self.disposition_file_name = unquote(fn)
                continue
            except IndexError:
                # No filename parameter: fall through so the raw header is
                # recorded once by the catch-all append below.
                pass

        if header == "Content-ID":
            self.content_id = CommonMethods.cdata_wrap(value)
            continue

        if header == "Content-Description":
            self.content_name = value
            continue

        self.other_mime_header.append(Header(header, value))
def _full_ext_body(self):
    """
    Build an ``ExtBodyContent`` carrying the full payload of
    ``self.payload``, write it out, and release the payload.

    The child map is built from an explicit list of pairs: passing a dict
    literal to ``OrderedDict`` only keeps the written order on interpreters
    where plain dicts are ordered, and the child order is what gets
    rendered.

    :return: None
    """
    extbody = ExtBodyContent()
    extbody.char_set = self.charset
    extbody.local_id = CommonMethods.increment_local_id()
    # Prefix the generated id with the local id, right-aligned and
    # zero-padded to five characters.
    extbody.gid = "{0:0>5}_{1}".format(extbody.local_id, extbody.gid)
    extbody.transfer_encoding = self.transfer_encoding
    extbody.eol = CommonMethods.get_eol(self.payload.get_payload())
    extbody.hash = CommonMethods.get_hash(self.payload.as_bytes())
    extbody.body_content = self.payload.get_payload()

    children = OrderedDict([
        ("ContentType", self.content_type),
        ("Disposition", self.disposition),
        ("DispositionFileName", self.disposition_file_name),
        ("ContentTransferEncoding", self.transfer_encoding),
    ])
    extbody.build_xml_file(children)
    self.ext_body_content.append(extbody)
    self.payload = None
def process_headers(self):
    """
    Read the ``Content-Type`` header of ``self.payload`` and store the
    content type and, when the parsed value has exactly three parts, the
    boundary string.  All other headers are ignored.

    :return: None
    """
    for name, raw_value in self.payload.items():
        if header_is_content_type(name):
            parsed = CommonMethods.get_content_type(raw_value)
            self.content_type = parsed[0]
            if len(parsed) == 3:
                self.boundary_string = parsed[2]


def header_is_content_type(name):
    """Return True when ``name`` is exactly ``Content-Type``."""
    return name == "Content-Type"
def build_xml_file(self, children):
    """
    Write this external body, choosing the de-duplicating builder when
    de-duplication is enabled in ``CommonMethods``.

    :type children : OrderedDict
    :param children: child element values handed through to the builder
    :return: None
    """
    dedupe_enabled = CommonMethods.get_dedupe()
    builder = self._build_dedup if dedupe_enabled else self._build_nodedup
    builder(children)
def __init__(self, relpath, mbox_path):
    """Constructor for Folder"""
    # For an EML structure the folder is named after the last component of
    # the relative path; otherwise after the second-to-last component of
    # the mbox path.
    if CommonMethods.is_eml_struct():
        folder_name = relpath.split(os.path.sep)[-1]
    else:
        folder_name = mbox_path.split(os.sep)[-2]

    self.name = folder_name  # type: str
    self.relpath = relpath
    self.messages = []  # type: list[DmMessage]
    self.folders = []  # type: list[Folder]
    # Size in bytes of whatever mbox_path points at on disk.
    self.mbox_size = os.path.getsize(mbox_path)
def do_walk(self):
    """
    Process every path in ``self.mboxes`` in order, then close the account
    (and stitch it when stitching is enabled).

    :return: None
    """
    self.start_account()
    for mbox_path in self.mboxes:
        self.current_relpath = self.get_rel_path(mbox_path)
        notice = 'Processing folder found at: {}'.format(mbox_path)
        self.logger.info(notice)
        # Reset the per-mbox state before reading the next file.
        self.new_folder = False
        self.mbx = None
        self.message_generator(mbox_path)
        self._fldr_render_continue(mbox_path)

    self.close_account()
    if CommonMethods.get_stitch():
        self.account.stitch_account()
def _store_body(self): # Checks to see if the ExtBody is a duplicate of the email body. # Remove and note in the ExtBody Disposition. if self.disposition_file_name != "rtf-body.rtf": return True if self.content_type.__contains__("richtext"): return True elif not CommonMethods.store_rtf_body(): # Check to see if we have flagged to save body duplicates self.disposition_comments = "Attachment is duplicate of BodyContent: Not saved" return False return True
def message_generator(self, path):
    """
    This is the main method that extracts email messages from an mbox.

    The file is read line by line; lines are buffered and the buffer is
    handed to ``self._transform_buffer`` when a separator line is seen and
    once more at end of file.

    :type path: str
    :param path: location of the mbox file to read
    :return: None
    """
    # Raw bytes literal: the pattern text is unchanged, but "\s" and "\@"
    # are no longer invalid string escapes.  Compiled once per call instead
    # of once per line.
    from_line = re.compile(rb'^From((\s("|.+).+\@)|(\s(".+")\s))')
    b_mark = None
    buff = []
    with open(path, 'rb') as fh:
        while True:
            line = CommonMethods.sanitize(fh.readline())
            if len(line) == 0:
                # readline() yielded nothing: end of file.  Flush whatever
                # is buffered as the final message and stop.
                self._transform_buffer(buff, path)
                buff = []
                break
            if from_line.search(line):
                if b_mark is None:
                    # Found the beginning of a message: set the marker and
                    # keep buffering until the next 'From ' line.
                    b_mark = 1
                else:
                    # Process the buffered message.
                    # NOTE(review): the marker is cleared here, so the next
                    # separator only re-arms it and does not flush - confirm
                    # that flushing on every other separator is intended.
                    b_mark = None
                    if CommonMethods.get_chunksize() != 0 and \
                            CommonMethods.get_chunksize() == self.chunks:
                        # Chunk limit reached: render the folder and reopen.
                        self._fldr_render_reopen(path)
                        self.chunks = 0
                    self._transform_buffer(buff, path, fh.tell())
                    buff = []
            buff.append(line)
def __init__(self, acct_directory, xml_dir, acct_name):
    """
    Set up the walker state, create the ``Account`` and start it.

    :param acct_directory: directory that is scanned for email files
    :param xml_dir: passed through to ``Account``
    :param acct_name: account name, also passed to ``Account``
    """
    # Caller-supplied configuration.
    self.account_name = acct_name
    self.account_directory = acct_directory
    self.xml_dir = xml_dir

    # Per-run bookkeeping.
    self.current_folder = None
    self.current_relpath = None  # type: str
    self.messages = []
    self.total_messages_processed = 0
    self.chunks = 0
    self.new_account = True
    self.new_dir = True
    self.folder_map = {}
    # NOTE(review): both are bound to the ``str`` type itself rather than
    # to a string instance - confirm this is the intended placeholder.
    self.expanded_path = str
    self.cur_fn = str

    # Collaborators; the account is started and its global id written
    # before any folder is processed.
    self.account = Account(acct_name, xml_dir)
    self.logger = logging.getLogger("EmlWalker")
    self.message_pack = DefaultListOrderedDict()
    self.account.start_account()
    self.account.write_global_id()

    # Settings read from CommonMethods.
    self.from_tomes = CommonMethods.get_tomes_tool()
    self.data_dir = os.path.join(CommonMethods.get_process_paths(),
                                 "mboxes")
def __init__(self, root_level, xml_dir, account_name):
    """Constructor for DirectoryWalker

    :param root_level: stored as ``self.root``
    :param xml_dir: passed through to ``Account``
    :param account_name: account name passed to ``Account``
    """
    self.mbx = None  # type: mailbox.mbox
    self.root = root_level
    self.folders = {}
    self.messages = []
    self.current_relpath = None  # type: str
    self.xml_dir = xml_dir
    self.account = Account(account_name, xml_dir)
    self.logger = logging.getLogger("MboxWalker")
    self.total_messages_processed = 0  # type: int
    self.chunks = CommonMethods.get_chunksize()  # type: int
    self.tracking_pos = 0  # type: int
    self.messages_in_folder = 0  # type: int
    self.messages_no_start_fldr = 0  # type: int
    self.message_no_end_flder = 0  # type: int
    self.new_account = True
    self.mboxes = []  # type: list
    self.new_folder = False
    # Raw string: the pattern text is unchanged, but "\s" and "\@" are no
    # longer invalid string escapes (a warning on current interpreters).
    self.mesg_begin = re.compile(r'^From((\s("|.+).+\@)|(\s(".+")\s))')
    self.json_folders = []
    # json_write only exists when JSON output is enabled.
    if CommonMethods.get_store_json():
        self.json_write = CommonMethods.get_json_directory()
def write_ext_body(self, xml):
    """
    Write ``xml`` to ``<gid>.xml`` in the attachment directory and record
    its relative path in ``self.rel_path``.  Nothing happens when
    ``self.xml_wrapped`` is false.

    The file is opened through a context manager so the handle is closed
    even when the write raises one of the Unicode errors handled below.

    :param xml: the text to write
    :return: None
    """
    if not self.xml_wrapped:
        return
    try:
        fn = '{}.xml'.format(self.gid)
        self.rel_path = ".{}".format(
            os.path.join(CommonMethods.get_rel_attachment_dir(), fn))
        target = os.path.join(self.attachment_directory, fn)
        with codecs.open(target, "w", "utf-8") as fh:
            fh.write(xml)
    except (UnicodeDecodeError, UnicodeEncodeError) as e:
        # Encoding problems are logged and the attachment is skipped.
        self.logger.error(e)
def _build_dedup(self, children):
    """
    Write this external body unless ``CommonMethods.set_ext_hash`` reports
    a falsy result, in which case the part is pointed at the gid returned
    by ``get_ext_gid`` and logged as a duplicate.  Either way the in-memory
    body content is released.

    :type children : OrderedDict
    :param children: child element values to include in the written XML
    :return: None
    """
    if not CommonMethods.set_ext_hash(self.gid, self.hash):
        # Duplicate: reuse the stored gid and reference its file.
        self.gid = CommonMethods.get_ext_gid(self.hash.value)
        self.rel_path = ".{}{}{}{}.xml".format(
            os.sep, CommonMethods.get_rel_attachment_dir(), os.sep,
            str(self.gid))
        self.body_content = None
        self.logger.info("Duplicate Attachment: {}".format(str(self.gid)))
        return

    fields = OrderedDict()
    fields["LocalUniqueID"] = str(self.gid)
    # Every child value is stringified and stripped of surrounding
    # double quotes.
    fields.update(
        (tag, str(raw).strip('"')) for tag, raw in children.items())
    fields["Content"] = self.body_content

    rendered = Render("ExternalBodyPart", fields).render()
    self.write_ext_body(rendered)
    self.body_content = None
def render(self, parent):
    """
    Append a ``Header`` element with ``Name`` and ``Value`` children to
    ``parent``.

    :type parent: xml.etree.ElementTree.Element
    :param parent: element that receives the new ``Header`` child
    :return: None
    """
    header_el = etree.SubElement(parent, "Header")
    name_el = etree.SubElement(header_el, "Name")
    name_el.text = self.name
    value_el = etree.SubElement(header_el, "Value")
    try:
        value_el.text = CommonMethods.cdata_wrap(self.value)
    except (ValueError, TypeError):
        # A value that cannot be wrapped leaves the Value element empty.
        pass
def render(self):
    """
    Build a ``Folder`` element containing this folder's name and messages
    and append it, pretty-printed as UTF-8, to the current EAXS file.

    The output file is opened through a context manager so the handle is
    closed once the element has been written.

    :return: None
    """
    folder = etree.Element("Folder")
    name = etree.SubElement(folder, "Name")
    name.text = self.name

    for mes in self.messages:  # type: DmMessage
        try:
            mes.render(folder)
        except AttributeError:
            # Best effort: a message that raises AttributeError while
            # rendering is skipped and the remaining ones are still
            # attempted.
            pass

    with open(CommonMethods.get_eaxs_filename(), "ab") as outfile:
        etree.ElementTree(folder).write(outfile, encoding="utf-8",
                                        pretty_print=True)
def _fldr_render(self, path):
    """
    Render the messages collected so far as one ``Folder``, optionally dump
    its JSON form, log the totals and reset the message list.

    The JSON file is opened through a context manager so the handle is
    closed even if rendering or serialising the folder fails.

    :param path: passed to ``Folder`` together with the current relative path
    :return: None
    """
    fldr = Folder(self.current_relpath, path)
    fldr.messages = self.messages
    fldr.render()

    if CommonMethods.get_store_json():
        json_path = os.path.join(self.json_write, fldr.name + ".json")
        with open(json_path, 'w', encoding='utf-8') as fh:
            # A comma is written ahead of the JSON document.
            fh.write(',')
            json.dump(fldr.render_json(), fh)

    self.logger.info('Wrote folder of size {} bytes'.format(
        fldr.mbox_size))
    self.logger.info('Messages processed: {}'.format(
        self.total_messages_processed))

    # Release the rendered folder and its messages before the next batch.
    fldr = None
    self.messages = []
    gc.collect()
def _process_plaintext_body(self):
    """
    Store the payload as an inline ``IntBodyContent``, or punt to an
    external body when the text is larger than 1 MiB as measured by
    ``sys.getsizeof``.

    Every ``[[`` and ``]]`` in the text is backslash-escaped before it is
    CDATA-wrapped.  ``self.payload`` is cleared once the inline path has
    run.

    :return: None
    """
    if isinstance(self.payload, Message):
        raw = self.payload.get_payload()
    elif isinstance(self.payload, str):
        raw = self.payload
    else:
        raw = None

    t = ""
    if raw is not None:
        # Raw strings keep the pattern and replacement text unchanged while
        # avoiding invalid string escapes.
        t = re.sub(r"\[\[", r"\[\[", raw)
        t = re.sub("]]", r"\]\]", t)

    if sys.getsizeof(t) > (1024 ** 2):
        # This is probably not a plaintext payload. Punt to external body.
        self._full_ext_body()
        return

    try:
        sbint = IntBodyContent(CommonMethods.cdata_wrap(t),
                               self.transfer_encoding, self.charset)
        # Empty content is dropped instead of being recorded.
        if sbint.content != '':
            self.body_content.append(sbint)
    except ValueError as ve:
        self.logger.error("{}".format(ve))
    self.payload = None
def _get_keywords(self):
    """
    Return ``self.keywords``, unwrapped through ``CommonMethods`` when it
    is an ``etree.CDATA`` object and unchanged otherwise.
    """
    keywords = self.keywords
    if not isinstance(keywords, etree.CDATA):
        return keywords
    return CommonMethods.cdata_unwrap(keywords)
def _get_content_id(self): if self.content_id is not None: if isinstance(self.content_id, etree.CDATA): return CommonMethods.cdata_unwrap(self.content_id) return self.content_id return str()
def __init__(self, rel_path, local_id, message, fn=None):
    """Constructor for Message

    Reads the standard headers of ``message`` into attributes, records any
    parsing problem in ``self.incomplete`` and then processes the remaining
    headers and the payload.

    ``m_to``, ``subject`` and ``status_flag`` are always defined: when
    their parsing fails they are left as ``None`` instead of being absent
    from the instance.

    :param rel_path: stored as ``self.relative_path``
    :param local_id: stored as ``self.local_id``
    :param message: object whose headers are read through ``.get(name)``
    :param fn: stored as ``self.fn``
    """
    self.logger = logging.getLogger("MessageType")
    self.message = message  # type: Message
    self.fn = fn

    def wrapped_header(name):
        # CDATA-wrap one header of the message being constructed.
        return CommonMethods.cdata_wrap(self.message.get(name))

    # First parts of the schema message-type
    self.relative_path = rel_path  # type: str
    self.local_id = local_id
    self.message_id = wrapped_header("Message-ID")  # type: str
    if self.message_id == '' or self.message_id is None:
        self.message_id = 'No Message-ID supplied'
    self.mime_version = wrapped_header("MIME-Version")  # type: str
    self.incomplete = []  # type: list[IncompleteParse]

    # xm:message-headers
    xml_d = CommonMethods.tup_to_xml_date(
        CommonMethods.parsedate_tz(self.message.get("Date")))
    self.orig_date = xml_d  # type: str
    self.m_from = wrapped_header("From")  # type: str
    self.sender = wrapped_header("Sender")  # type: str

    self.m_to = None  # type: str
    try:
        self.m_to = wrapped_header("To")
    except TypeError as te:
        self.logger.error("{}".format(te))
        self.incomplete.append(
            IncompleteParse('TypeError parsing To Header', te))

    self.cc = wrapped_header("Cc")  # type: str
    self.bcc = wrapped_header("Bcc")  # type: str
    self.in_reply_to = wrapped_header("In-Reply-To")
    self.references = wrapped_header("References")  # type: str
    self.comments = wrapped_header("Comments")  # type: str
    self.keywords = wrapped_header("Keywords")  # type: str

    self.subject = None  # type: str
    try:
        self.subject = wrapped_header("Subject")
    except TypeError as te:
        self.logger.error("{}".format(te))
        self.incomplete.append(
            IncompleteParse('TypeError parsing Subject line', te))

    self.status_flag = None  # type: str
    try:
        self.status_flag = status.get(self.message.get("Status"))
    except Exception as e:
        self.incomplete.append(
            IncompleteParse('TypeError parsing Status', e))

    self.headers = []  # type: list[Header]
    self.single_body = []  # type: list[SingleBody]
    self.multiple_body = []  # type: list[MultiBody]

    try:
        self.eol = CommonMethods.get_eol(
            self.message.as_string(policy=self.message.policy.clone(
                utf8=True)))  # type: str
    except KeyError as e:
        self.logger.error("Inspect Message: KeyError {}".format(
            self.message.get("Message-ID")))
        self.incomplete.append(IncompleteParse('KeyError parsing EOL', e))
    except UnicodeEncodeError as ue:
        print(sys.gettrace())
        self.logger.error("Inspect Message: UnicodeEncodeError {}".format(
            self.message.get("Message-ID")))
        self.incomplete.append(
            IncompleteParse('UnicodeEncodeError parsing EOL', ue))
    except LookupError as le:
        self.logger.error("Inspect Message: LookupError {}".format(
            self.message.get("Message-ID")))
        self.incomplete.append(
            IncompleteParse('LookupError parsing EOL', le))
    except Exception as er:
        self.incomplete.append(
            IncompleteParse('LookupError parsing EOL', er))
    finally:
        # NOTE(review): this runs on success as well, so the detected EOL
        # is always replaced by 'LF' and the attempt above only serves to
        # record failures - confirm that is intended.
        self.eol = 'LF'

    self.hash = CommonMethods.get_hash(
        self.message.as_bytes())  # type: Hash
    self._process_headers()
    self._process_payload()