Example #1
0
    def do_walk(self):
        """
        Entry point: collect every ``eml`` file below the account directory
        and hand the collected folders to ``process_folders``.

        For a TOMES_TOOL structure the account directory is first rebuilt
        from ``data_dir`` and ``account_name`` and the folder map is loaded
        from ``folder_map.tsv``; when that file is absent the TOMES flag is
        switched off through ``CommonMethods.set_from_tomes(False)``.

        :return: None
        """
        if self.from_tomes:
            self.account_directory = os.path.join(self.data_dir,
                                                  self.account_name)
            map_file = os.path.join(self.account_directory, "folder_map.tsv")
            if os.path.exists(map_file):
                self._build_folder_map(map_file)
            else:
                # No map on disk: stop treating this as a TOMES structure.
                CommonMethods.set_from_tomes(False)

        print("Scanning data structure for emails.")
        for folder, _subdirs, names in os.walk(self.account_directory):
            if not names:
                continue
            # Any directory holding at least one file gets an entry, even
            # when none of its files turn out to be emails.
            if folder not in self.message_pack:
                self.message_pack[folder] = []
            self.message_pack[folder].extend(
                name for name in names if name.endswith("eml"))
        self.process_folders()
 def _write_file(self):
     try:
         fh = codecs.open(self.current_eaxs_file, "ab", "utf-8")
         fh.write(self.get_root_element_attributes())
         fh.close()
         CommonMethods.set_eaxs_file(self.current_eaxs_file)
     except FileNotFoundError as e:
         self.logger.error("{}: {}".format(e, self.current_eaxs_file))
 def write_global_id(self):
     """
     Append the value returned by ``get_id`` to the current EAXS file.

     The target is whatever ``CommonMethods.get_eaxs_filename`` reports; the
     same name is registered again through ``CommonMethods.set_eaxs_file``
     after the write.  A ``FileNotFoundError`` is logged together with the
     file name instead of being raised.

     :return: None
     """
     try:
         # The context manager closes the handle even when the write fails.
         with codecs.open(CommonMethods.get_eaxs_filename(), "ab",
                          "utf-8") as fh:
             fh.write(self.get_id())
         CommonMethods.set_eaxs_file(CommonMethods.get_eaxs_filename())
     except FileNotFoundError as e:
         self.logger.error("{}: {}".format(
             e, CommonMethods.get_eaxs_filename()))
Example #4
0
 def _process_message(self, mes):
     """
     Wrap a parsed message in a DmMessage and queue it on ``self.messages``.

     When the TOMES tool flag is set the relative path is first expanded
     through ``expand_path_from_map``; otherwise ``current_relpath`` is used
     unchanged.

     :param mes: the message object handed to the DmMessage constructor
     :return: None
     """
     if CommonMethods.get_tomes_tool():
         rel_path = self.expand_path_from_map(self.current_relpath)
     else:
         rel_path = self.current_relpath
     wrapped = DmMessage(rel_path, CommonMethods.increment_local_id(), mes,
                         self.cur_fn)
     # The wrapper no longer needs the source message once it is built.
     wrapped.message = None
     self.messages.append(wrapped)
Example #5
0
 def __init__(self):
     """Constructor for ExtBodyContent"""
     self.logger = logging.getLogger("ExtBodyContent")
     # Every instance starts with a fresh random identifier.
     self.gid = uuid.uuid4()  # type: uuid

     # Where the attachment is written.
     self.attachment_folder = CommonMethods.get_attachment_directory()
     self.attachment_directory = os.path.join(
         CommonMethods.get_attachment_directory(), self.attachment_folder)

     # Descriptive fields, filled in by the caller after construction.
     self.local_id = None  # type: int
     self.rel_path = None  # type: str
     self.char_set = None  # type: str
     self.transfer_encoding = None  # type: str
     self.eol = None  # type: Eol
     self.hash = None  # type: Hash
     self.body_content = None  # type: str
     self.xml_wrapped = True  # type: bool
Example #6
0
 def _simple_ext_body(self):
     """
     Register an external body part built from this body's metadata.

     A new ExtBodyContent receives a fresh local id, the transfer encoding
     and a hash of ``self.body_content`` (encoded as UTF-8), is asked to
     build its XML file and is appended to ``self.ext_body_content``.
     Afterwards ``self.payload`` and ``self.body_content`` are cleared.

     :return: None
     """
     extbody = ExtBodyContent()
     extbody.local_id = CommonMethods.increment_local_id()
     extbody.transfer_encoding = self.transfer_encoding
     extbody.hash = CommonMethods.get_hash(
         bytes(self.body_content, encoding='utf-8'))
     # NOTE(review): extbody.body_content is never assigned here, unlike the
     # hash input above -- confirm that an empty Content is intended.
     # Built from pairs: a dict literal handed to OrderedDict only keeps
     # this element order on Python 3.7+.
     children = OrderedDict([
         ("ContentType", self.content_type),
         ("Disposition", self.disposition),
         ("DispositionFileName", self.disposition_file_name),
         ("ContentTransferEncoding", self.transfer_encoding),
     ])
     extbody.build_xml_file(children)
     self.ext_body_content.append(extbody)
     self.payload = None
     self.body_content = None
Example #7
0
 def process_folders(self):
     """
     Convert every collected file, folder by folder, then close the account.

     For each entry of ``self.message_pack`` the relative path is resolved,
     each file is passed to ``message_generator`` and the folder is
     rendered.  When a non-zero chunk size is configured and
     ``self.chunks`` has reached it, the folder is rendered and reopened
     and the counter is reset before the next file.  The account is
     stitched at the end when ``CommonMethods.get_stitch()`` is set.

     :return: None
     """
     for folder_path, file_names in self.message_pack.items():
         self.current_relpath = self.get_rel_path(folder_path)
         for file_name in file_names:
             chunk_limit = CommonMethods.get_chunksize()
             if chunk_limit != 0 and chunk_limit == self.chunks:
                 # Render the folder and reopen
                 self._fldr_render_reopen(folder_path)
                 self.chunks = 0
             self.cur_fn = file_name
             self.message_generator(os.path.join(folder_path, file_name))
         self._fldr_render(folder_path)

     self.account.close_account()
     if not CommonMethods.get_stitch():
         return
     self.account.stitch_account()
Example #8
0
    def render(self, parent):
        """
        Append a ``MultiBody`` element for this object to *parent*.

        The attributes named by ``CommonMethods.get_multibody_map()`` are
        visited in map order.  List attributes have their SingleBody and
        MultiBody items rendered recursively; any other non-None value
        becomes the text of a child element named after the mapped tag.

        :type parent: xml.etree.ElementTree.Element
        :param parent: element that receives the new ``MultiBody`` child
        :return: None
        """
        multi_node = etree.SubElement(parent, "MultiBody")
        for attr_name, tag in CommonMethods.get_multibody_map().items():
            attr = getattr(self, attr_name)

            if attr is None:
                if attr_name in ('charset', 'boundary_string'):
                    # This is stupid but is required by the schema
                    etree.SubElement(multi_node, tag).text = attr
                continue

            if isinstance(attr, list):
                # TODO: Handle this
                for part in attr:
                    for body_cls in (SingleBody, MultiBody):
                        if isinstance(part, body_cls):
                            part.render(multi_node)
                continue

            etree.SubElement(multi_node, tag).text = attr
    def render(self, parent=None):
        """
        Append a ``Message`` element for this object to *parent*.

        Nothing happens when *parent* is None.  Otherwise ``local_id`` is
        converted to a string and the attributes named by
        ``CommonMethods.get_messagetype_map()`` are visited in map order:
        list items that are Header or MultiBody objects render themselves,
        Hash and MultiBody values render themselves, and every other
        non-None value becomes the text of a child element.

        :type parent: Element
        :param parent: element that receives the new ``Message`` child
        :return: None
        """
        if parent is None:
            return

        self.local_id = str(self.local_id)
        message = etree.SubElement(parent, "Message")
        for attr_name, tag in CommonMethods.get_messagetype_map().items():
            attr = getattr(self, attr_name)
            if attr is None:
                continue

            if isinstance(attr, list):
                #TODO: Handle this
                for entry in attr:
                    for renderable in (Header, MultiBody):
                        if isinstance(entry, renderable):
                            entry.render(message)
                continue

            if isinstance(attr, (Hash, MultiBody)):
                attr.render(message)
                continue

            etree.SubElement(message, tag).text = attr
Example #10
0
 def render(self, parent):
     """
     Append a ``SingleBody`` element for this object to *parent*.

     The attributes named by ``CommonMethods.get_singlebody_map()`` are
     visited in map order.  For a non-empty list the type of its first
     item decides what is rendered: ``self.ext_body_content`` for
     ExtBodyContent, ``self.body_content`` for IntBodyContent, nothing
     otherwise.  Any other non-None value becomes the text of a child
     element; a ``TypeError`` while assigning the text is ignored and the
     child is left empty.

     :type parent: xml.etree.ElementTree.Element
     :param parent: element that receives the new ``SingleBody`` child
     :return: None
     """
     single_node = etree.SubElement(parent, "SingleBody")
     for attr_name, tag in CommonMethods.get_singlebody_map().items():
         attr = getattr(self, attr_name)
         if attr is None:
             continue

         if isinstance(attr, list):
             if not attr:
                 continue
             first = attr[0]
             if isinstance(first, ExtBodyContent):
                 parts = self.ext_body_content
             elif isinstance(first, IntBodyContent):
                 parts = self.body_content
             else:
                 parts = ()
             for part in parts:
                 part.render(single_node)
             continue

         node = etree.SubElement(single_node, tag)
         try:
             node.text = attr
         except TypeError:
             pass
 def _set_vars(self):
     """
     Reset the shared CommonMethods settings and derive the working folders.

     RTF body storage is switched off, the hash dictionary is initialised,
     ``set_dedupe`` is called and the base path is registered; then the
     ``eaxs``, ``mboxes``, ``emls`` and ``pst`` sub-directories of
     ``self.base_dir`` are stored on the instance.

     :return: None
     """
     CommonMethods.set_store_rtf_body(False)
     CommonMethods.init_hash_dict()
     CommonMethods.set_dedupe()

     base = self.base_dir
     CommonMethods.set_base_path(base)
     # (attribute name, sub-directory name) -- note psts lives in 'pst'.
     layout = (
         ("eaxs", 'eaxs'),
         ("mboxes", 'mboxes'),
         ("emls", 'emls'),
         ("psts", 'pst'),
     )
     for attr_name, subdir in layout:
         setattr(self, attr_name, os.path.join(base, subdir))
Example #12
0
    def process_headers(self):
        """
        Sort the MIME headers of ``self.payload`` into this object's fields.

        A plain-string payload has no headers: it is stored as
        ``body_content``, ``body_only`` is set and processing stops.
        Otherwise Content-Type, Content-Transfer-Encoding,
        Content-Disposition, Content-ID and Content-Description are mapped
        onto their attributes and every other header is appended to
        ``other_mime_header``.

        A Content-Disposition header without a ``name=value`` parameter
        still sets ``disposition`` and is recorded once in
        ``other_mime_header``.

        :return: None
        """
        if isinstance(self.payload, str):
            self.body_content = self.payload
            self.body_only = True
            return

        for header, value in self.payload.items():

            if header == "Content-Type":
                expression = CommonMethods.get_content_type(value)
                self.content_type = expression[0]
                if len(expression) > 1:
                    # Is this a charset identification
                    if expression[1] == 'charset':
                        self.charset = expression[2]
                    else:
                        self.content_type_param.append(
                            Parameter(expression[1], expression[2]))
                continue

            if header == "Content-Transfer-Encoding":
                self.transfer_encoding = value
                continue

            if header == "Content-Disposition":
                parts = value.split(";")
                self.disposition = parts[0]
                try:
                    fn = parts[1].split("=")[1]
                except IndexError:
                    # No filename parameter: keep the raw header.  Skipping
                    # to the next header prevents the catch-all append at
                    # the bottom of the loop from recording it a second time.
                    self.other_mime_header.append(Header(header, value))
                    continue
                # Extended form charset''percent-encoded-name: keep the name.
                pieces = fn.split("''")
                self.disposition_file_name = unquote(
                    pieces[1] if len(pieces) > 1 else fn)
                continue

            if header == "Content-ID":
                self.content_id = CommonMethods.cdata_wrap(value)
                continue

            if header == "Content-Description":
                self.content_name = value
                continue

            self.other_mime_header.append(Header(header, value))
Example #13
0
 def _full_ext_body(self):
     """
     Move the whole payload into a new external body part.

     A new ExtBodyContent receives the charset, a fresh local id, a gid
     prefixed with the zero-padded local id, the transfer encoding, the EOL
     and hash of the payload and the payload content itself.  It is asked
     to build its XML file and is appended to ``self.ext_body_content``;
     ``self.payload`` is then cleared.

     ``self.payload`` may be a message object or a plain ``str``; a string
     is used as the content directly and hashed as UTF-8 bytes.

     :return: None
     """
     if isinstance(self.payload, str):
         # A bare string has neither get_payload() nor as_bytes().
         content = self.payload
         raw = content.encode('utf-8')
     else:
         content = self.payload.get_payload()
         raw = self.payload.as_bytes()

     extbody = ExtBodyContent()
     extbody.char_set = self.charset
     extbody.local_id = CommonMethods.increment_local_id()
     extbody.gid = "{0:0>5}_{1}".format(extbody.local_id, extbody.gid)
     extbody.transfer_encoding = self.transfer_encoding
     extbody.eol = CommonMethods.get_eol(content)
     extbody.hash = CommonMethods.get_hash(raw)
     extbody.body_content = content
     # Built from pairs: a dict literal handed to OrderedDict only keeps
     # this element order on Python 3.7+.
     children = OrderedDict([
         ("ContentType", self.content_type),
         ("Disposition", self.disposition),
         ("DispositionFileName", self.disposition_file_name),
         ("ContentTransferEncoding", self.transfer_encoding),
     ])
     extbody.build_xml_file(children)
     self.ext_body_content.append(extbody)
     self.payload = None
Example #14
0
 def process_headers(self):
     """
     Read the Content-Type header(s) of ``self.payload``.

     The first element returned by ``CommonMethods.get_content_type`` is
     stored as ``content_type``; when exactly three elements come back the
     third one is stored as ``boundary_string``.  All other headers are
     ignored.

     :return: None
     """
     for name, raw_value in self.payload.items():
         if name != "Content-Type":
             continue
         expression = CommonMethods.get_content_type(raw_value)
         has_boundary = len(expression) == 3
         self.content_type = expression[0]
         if has_boundary:
             self.boundary_string = expression[2]
Example #15
0
 def build_xml_file(self, children):
     """
     Build the XML file for this body part, with or without de-duplication.

     ``CommonMethods.get_dedupe()`` selects ``_build_dedup`` or
     ``_build_nodedup``; *children* is passed on unchanged.

     :type children : OrderedDict
     :param children: child values handed to the selected builder
     :return: None
     """
     builder = (self._build_dedup
                if CommonMethods.get_dedupe() else self._build_nodedup)
     builder(children)
 def __init__(self, relpath, mbox_path):
     """
     Constructor for Folder

     The folder name is the last component of *relpath* for an EML
     structure, otherwise the second-to-last component of *mbox_path*.

     :param relpath: relative path stored on the folder
     :param mbox_path: path whose size on disk is recorded as ``mbox_size``
     """
     if CommonMethods.is_eml_struct():
         folder_name = relpath.split(os.path.sep)[-1]
     else:
         folder_name = mbox_path.split(os.sep)[-2]
     self.name = folder_name  # type: str
     self.relpath = relpath
     self.messages = []  # type: list[DmMessage]
     self.folders = []  # type: list[Folder]
     self.mbox_size = os.path.getsize(mbox_path)
Example #17
0
 def do_walk(self):
     """
     Convert every mbox listed in ``self.mboxes``, then close the account.

     For each path the relative path is resolved, the per-folder state
     (``new_folder``, ``mbx``) is reset, the messages are generated and the
     folder is rendered through ``_fldr_render_continue``.  The account is
     stitched at the end when ``CommonMethods.get_stitch()`` is set.

     :return: None
     """
     self.start_account()
     for mbox_path in self.mboxes:
         self.current_relpath = self.get_rel_path(mbox_path)
         self.logger.info('Processing folder found at: {}'.format(mbox_path))
         # Fresh per-folder state.
         self.new_folder, self.mbx = False, None
         self.message_generator(mbox_path)
         self._fldr_render_continue(mbox_path)
     self.close_account()
     if not CommonMethods.get_stitch():
         return
     self.account.stitch_account()
Example #18
0
 def _store_body(self):
     # Checks to see if the ExtBody is a duplicate of the email body.
     # Remove and note in the ExtBody Disposition.
     if self.disposition_file_name != "rtf-body.rtf":
         return True
     if self.content_type.__contains__("richtext"):
         return True
     elif not CommonMethods.store_rtf_body():
         # Check to see if we have flagged to save body duplicates
         self.disposition_comments = "Attachment is duplicate of BodyContent: Not saved"
         return False
     return True
Example #19
0
    def message_generator(self, path):
        """
        This is the main method that extracts email messages from an mbox.

        The file is read line by line, each line passing through
        ``CommonMethods.sanitize``.  Lines are buffered; every ``From ``
        separator line after the first hands the buffer to
        ``_transform_buffer`` and starts a new one, and whatever is left is
        flushed at end of file.  When a non-zero chunk size is configured
        and ``self.chunks`` has reached it, the folder is rendered and
        reopened and the counter is reset before the flush.

        :type path: str
        :param path: location of the mbox file to read
        :return: None
        """
        # Raw literal: \s and \@ are regex escapes, not valid escape
        # sequences of an ordinary bytes literal.
        separator = re.compile(rb'^From((\s(\"|.+).+\@)|(\s(\".+\")\s))')
        in_message = False
        buff = []

        with open(path, 'rb') as fh:
            # Open the mbox found at path
            while True:
                line = CommonMethods.sanitize(fh.readline())
                if len(line) == 0:
                    # End of file: write the final message.
                    self._transform_buffer(buff, path)
                    break
                if separator.search(line):
                    # Per RFC
                    if in_message:
                        # The buffer holds one complete message.  The flag
                        # stays set: this separator also opens the next
                        # message, so the following separator must flush
                        # again (resetting it here merged every second
                        # message into its predecessor).
                        chunk_size = CommonMethods.get_chunksize()
                        if chunk_size != 0 and chunk_size == self.chunks:
                            # Render the folder and reopen
                            self._fldr_render_reopen(path)
                            self.chunks = 0
                        self._transform_buffer(buff, path, fh.tell())
                        buff = []
                    else:
                        # Found the beginning of the first message.
                        in_message = True
                buff.append(line)
Example #20
0
 def __init__(self, acct_directory, xml_dir, acct_name):
     """
     Set up the walker state and start the account output.

     Besides initialising attributes, construction calls
     ``start_account`` and ``write_global_id`` on a new Account built from
     *acct_name* and *xml_dir*.

     :param acct_directory: stored as ``account_directory``
     :param xml_dir: stored as ``xml_dir`` and passed to Account
     :param acct_name: stored as ``account_name`` and passed to Account
     """
     self.account_name = acct_name
     self.account_directory = acct_directory
     self.xml_dir = xml_dir
     self.account = Account(acct_name, xml_dir)
     self.current_folder = None
     self.messages = []
     self.current_relpath = None  # type: str
     self.total_messages_processed = 0
     self.logger = logging.getLogger("EmlWalker")
     self.message_pack = DefaultListOrderedDict()
     # The account output is started as part of construction.
     self.account.start_account()
     self.account.write_global_id()
     self.chunks = 0
     self.new_account = True
     self.from_tomes = CommonMethods.get_tomes_tool()
     # "mboxes" folder below the path reported by get_process_paths().
     self.data_dir = os.path.join(CommonMethods.get_process_paths(),
                                  "mboxes")
     self.folder_map = {}
     # NOTE(review): the next placeholder and cur_fn below hold the ``str``
     # type object itself, not an empty string -- presumably they are
     # overwritten before use; confirm.
     self.expanded_path = str
     self.new_dir = True
     self.cur_fn = str
Example #21
0
 def __init__(self, root_level, xml_dir, account_name):
     """
     Constructor for DirectoryWalker

     Initialises the counters and per-folder state, builds an Account from
     *account_name* and *xml_dir*, and records the JSON output directory
     when ``CommonMethods.get_store_json()`` is set.

     :param root_level: stored as ``root``
     :param xml_dir: stored as ``xml_dir`` and passed to Account
     :param account_name: passed to Account
     """
     self.mbx = None  # type: mailbox.mbox
     self.root = root_level
     self.folders = {}
     self.messages = []
     self.current_relpath = None  # type: str
     self.xml_dir = xml_dir
     self.account = Account(account_name, xml_dir)
     self.logger = logging.getLogger("MboxWalker")
     self.total_messages_processed = 0  # type: int
     self.chunks = CommonMethods.get_chunksize()  # type: int
     self.tracking_pos = 0  # type: int
     self.messages_in_folder = 0  # type: int
     self.messages_no_start_fldr = 0  # type: int
     self.message_no_end_flder = 0  # type: int
     self.new_account = True
     self.mboxes = []  # type: list
     self.new_folder = False
     # Raw literal: \s and \@ are regex escapes, not valid string escapes.
     self.mesg_begin = re.compile(r'^From((\s(\"|.+).+\@)|(\s(\".+\")\s))')
     self.json_folders = []
     # json_write only exists when JSON storage is switched on.
     if CommonMethods.get_store_json():
         self.json_write = CommonMethods.get_json_directory()
Example #22
0
 def write_ext_body(self, xml):
     """
     Write *xml* to ``<gid>.xml`` inside the attachment directory.

     Nothing happens unless ``self.xml_wrapped`` is set.  ``self.rel_path``
     is set to the file's location below the relative attachment directory,
     prefixed with a dot.  Unicode encode/decode errors are logged instead
     of being raised.

     :param xml: text to write (UTF-8)
     :return: None
     """
     if not self.xml_wrapped:
         return
     try:
         fn = '{}.xml'.format(self.gid)
         self.rel_path = ".{}".format(
             os.path.join(CommonMethods.get_rel_attachment_dir(), fn))
         # The context manager closes the handle even when the write raises
         # one of the errors handled below.
         with codecs.open(os.path.join(self.attachment_directory, fn),
                          "w", "utf-8") as fh:
             fh.write(xml)
     except (UnicodeDecodeError, UnicodeEncodeError) as e:
         self.logger.error(e)
Example #23
0
 def _build_dedup(self, children):
     """
     Write the external body unless ``CommonMethods.set_ext_hash`` rejects it.

     When ``set_ext_hash`` returns a truthy value the part is rendered as an
     ``ExternalBodyPart`` (LocalUniqueID, the *children* with surrounding
     double quotes stripped, then Content) and written out.  Otherwise the
     part is logged as a duplicate and only ``gid`` and ``rel_path`` are
     pointed at the entry looked up by hash.  ``body_content`` is cleared
     in both cases.

     :type children : OrderedDict
     :param children: element name -> value pairs for the rendered part
     :return: None
     """
     if not CommonMethods.set_ext_hash(self.gid, self.hash):
         self.gid = CommonMethods.get_ext_gid(self.hash.value)
         self.rel_path = ".{}{}{}{}.xml".format(
             os.sep, CommonMethods.get_rel_attachment_dir(), os.sep,
             str(self.gid))
         self.body_content = None
         self.logger.info("Duplicate Attachment: {}".format(str(self.gid)))
         return

     elements = OrderedDict()
     elements["LocalUniqueID"] = str(self.gid)
     for tag, raw_value in children.items():
         elements[tag] = str(raw_value).strip("\"")
     elements["Content"] = self.body_content
     rendered = Render("ExternalBodyPart", elements).render()
     self.write_ext_body(rendered)
     self.body_content = None
 def render(self, parent):
     """
     Append a ``Header`` element with ``Name`` and ``Value`` children.

     The value is wrapped through ``CommonMethods.cdata_wrap``; a
     ``ValueError`` or ``TypeError`` raised while wrapping or assigning it
     is ignored and the ``Value`` element is left empty.

     :type parent: xml.etree.ElementTree.Element
     :param parent: element that receives the new ``Header`` child
     :return: None
     """
     header_node = etree.SubElement(parent, "Header")
     etree.SubElement(header_node, "Name").text = self.name
     value_node = etree.SubElement(header_node, "Value")
     try:
         value_node.text = CommonMethods.cdata_wrap(self.value)
     except (ValueError, TypeError):
         pass
 def render(self):
     """
     Serialise this folder and append it to the current EAXS file.

     A ``Folder`` element with a ``Name`` child is built, every message in
     ``self.messages`` renders itself into it, and the result is appended
     (UTF-8, pretty printed) to the file named by
     ``CommonMethods.get_eaxs_filename()``.

     :return: None
     """
     folder = etree.Element("Folder")
     name = etree.SubElement(folder, "Name")
     name.text = self.name
     for mes in self.messages:  # type: DmMessage
         try:
             mes.render(folder)
         except AttributeError:
             # Best effort: a message that fails part-way keeps whatever it
             # already rendered and the remaining messages still get written.
             pass
     # The context manager closes the output file; it used to be left open.
     with open(CommonMethods.get_eaxs_filename(), "ab") as outfile:
         etree.ElementTree(folder).write(outfile,
                                         encoding="utf-8",
                                         pretty_print=True)
Example #26
0
 def _fldr_render(self, path):
     """
     Render the buffered messages as one folder and reset the buffer.

     A Folder built from ``self.current_relpath`` and *path* takes over
     ``self.messages`` and renders itself.  When
     ``CommonMethods.get_store_json()`` is set, a ``<folder name>.json``
     file holding a leading comma and the folder's JSON rendering is
     written to ``self.json_write``.  Finally the folder size and the
     running message count are logged, ``self.messages`` is emptied and a
     garbage collection is forced.

     :param path: path handed to the Folder constructor
     :return: None
     """
     fldr = Folder(self.current_relpath, path)
     fldr.messages = self.messages
     fldr.render()
     if CommonMethods.get_store_json():
         json_path = os.path.join(self.json_write, fldr.name + ".json")
         # The context manager closes the file even when rendering or
         # dumping the JSON fails.
         with open(json_path, 'w', encoding='utf-8') as fh:
             fh.write(',')
             json.dump(fldr.render_json(), fh)
     self.logger.info('Wrote folder of size {} bytes'.format(
         fldr.mbox_size))
     self.logger.info('Messages processed: {}'.format(
         self.total_messages_processed))
     # Drop the references before collecting.
     fldr = None
     self.messages = []
     gc.collect()
Example #27
0
 def _process_plaintext_body(self):
     """
     Store the payload as an internal (inline) body content.

     The text of a Message or ``str`` payload has every ``[[`` and ``]]``
     backslash-escaped; any other payload type yields an empty text.  A
     text whose in-memory size exceeds 1 MiB is handed to
     ``_full_ext_body`` instead.  Otherwise the CDATA-wrapped text becomes
     an IntBodyContent that is appended to ``self.body_content`` unless its
     content is empty.  A ``ValueError`` is logged.  ``self.payload`` is
     cleared on every path except the external-body hand-off.

     :return: None
     """
     t = ""
     if isinstance(self.payload, (Message, str)):
         if isinstance(self.payload, Message):
             raw = self.payload.get_payload()
         else:
             raw = self.payload
         # Raw literals: \[ and \] are not valid string escape sequences.
         t = re.sub(r"\[\[", r"\[\[", raw)
         t = re.sub("]]", r"\]\]", t)

     if sys.getsizeof(t) > (1024 ** 2):
         # This is probably not a plaintext payload. Punt to external body.
         self._full_ext_body()
         return

     try:
         sbint = IntBodyContent(CommonMethods.cdata_wrap(t),
                                self.transfer_encoding, self.charset)
         if sbint.content != '':
             self.body_content.append(sbint)
     except ValueError as ve:
         self.logger.error("{}".format(ve))
     self.payload = None
 def _get_keywords(self):
     """
     Return ``self.keywords``, unwrapped when it is an ``etree.CDATA``.

     :return: the result of ``CommonMethods.cdata_unwrap`` for a CDATA
              value, otherwise the stored value unchanged
     """
     keywords = self.keywords
     if not isinstance(keywords, etree.CDATA):
         return keywords
     return CommonMethods.cdata_unwrap(keywords)
Example #29
0
 def _get_content_id(self):
     if self.content_id is not None:
         if isinstance(self.content_id, etree.CDATA):
             return CommonMethods.cdata_unwrap(self.content_id)
         return self.content_id
     return str()
    def __init__(self, rel_path, local_id, message, fn=None):
        """
        Constructor for Message

        Copies the standard headers of *message* into attributes (most of
        them wrapped through ``CommonMethods.cdata_wrap``), determines the
        end-of-line marker and the hash of the message, then processes the
        remaining headers and the payload.  Problems while reading
        individual fields are collected in ``self.incomplete`` instead of
        aborting construction; the affected field is left as ``None`` and
        the EOL falls back to ``'LF'``.

        :param rel_path: stored as ``relative_path``
        :param local_id: stored as ``local_id``
        :param message: message object the headers are read from
        :param fn: optional value stored as ``fn``
        """
        self.logger = logging.getLogger("MessageType")
        self.message = message  # type: Message
        self.fn = fn
        # First parts of the schema message-type
        self.relative_path = rel_path  # type: str
        self.local_id = local_id
        self.message_id = CommonMethods.cdata_wrap(
            self.message.get("Message-ID"))  # type: str
        if self.message_id == '' or self.message_id is None:
            self.message_id = 'No Message-ID supplied'
        self.mime_version = CommonMethods.cdata_wrap(
            self.message.get("MIME-Version"))  # type: str
        self.incomplete = []  # type: list[IncompleteParse]

        # xm:message-headers
        xml_d = CommonMethods.tup_to_xml_date(
            CommonMethods.parsedate_tz(self.message.get("Date")))
        self.orig_date = xml_d  # type: str
        self.m_from = CommonMethods.cdata_wrap(
            self.message.get("From"))  # type: str
        self.sender = CommonMethods.cdata_wrap(
            self.message.get("Sender"))  # type: str
        # Defined up front so the attribute exists even when parsing fails.
        self.m_to = None  # type: str
        try:
            self.m_to = CommonMethods.cdata_wrap(self.message.get("To"))
        except TypeError as te:
            self.logger.error("{}".format(te))
            self.incomplete.append(
                IncompleteParse('TypeError parsing To Header', te))
        self.cc = CommonMethods.cdata_wrap(self.message.get("Cc"))  # type: str
        self.bcc = CommonMethods.cdata_wrap(
            self.message.get("Bcc"))  # type: str
        self.in_reply_to = CommonMethods.cdata_wrap(
            self.message.get("In-Reply-To"))
        self.references = CommonMethods.cdata_wrap(
            self.message.get("References"))  # type: str
        self.comments = CommonMethods.cdata_wrap(
            self.message.get("Comments"))  # type: str
        self.keywords = CommonMethods.cdata_wrap(
            self.message.get("Keywords"))  # type: str
        self.subject = None  # type: str
        try:
            self.subject = CommonMethods.cdata_wrap(
                self.message.get("Subject"))
        except TypeError as te:
            self.logger.error("{}".format(te))
            self.incomplete.append(
                IncompleteParse('TypeError parsing Subject line', te))
        self.status_flag = None  # type: str
        try:
            self.status_flag = status.get(self.message.get("Status"))
        except Exception as e:
            self.incomplete.append(
                IncompleteParse('TypeError parsing Status', e))

        self.headers = []  # type: list[Header]
        self.single_body = []  # type: list[SingleBody]
        self.multiple_body = []  # type: list[MultiBody]

        # 'LF' is only the fallback for a failed detection; it used to be
        # assigned in a ``finally`` clause, which discarded every
        # successfully detected value as well.
        self.eol = 'LF'  # type: str
        try:
            self.eol = CommonMethods.get_eol(
                self.message.as_string(policy=self.message.policy.clone(
                    utf8=True)))
        except KeyError as e:
            self.logger.error("Inspect Message: KeyError {}".format(
                self.message.get("Message-ID")))
            self.incomplete.append(IncompleteParse('KeyError parsing EOL', e))
        except UnicodeEncodeError as ue:
            self.logger.error("Inspect Message: UnicodeEncodeError {}".format(
                self.message.get("Message-ID")))
            self.incomplete.append(
                IncompleteParse('UnicodeEncodeError parsing EOL', ue))
        except LookupError as le:
            self.logger.error("Inspect Message: LookupError {}".format(
                self.message.get("Message-ID")))
            self.incomplete.append(
                IncompleteParse('LookupError parsing EOL', le))
        except Exception as er:
            # Anything else: recorded, not logged.
            self.incomplete.append(
                IncompleteParse('Error parsing EOL', er))

        self.hash = CommonMethods.get_hash(
            self.message.as_bytes())  # type: Hash

        self._process_headers()
        self._process_payload()