Beispiel #1
0
    def test_extracted_correct_from_header(self):
        """Check how the FROM header control word drives the content type.

        - a lone fromhtml1 header is reported as 'html'
        - a lone fromtext header is reported as 'text'
        - two FROM headers back to back, in either order, must be
          rejected with MalformedEncapsulatedRtf
        """
        template_path = join(DATA_BASE_DIR, "rtf_parsing",
                             "from_header_template.rtf")

        # A single FROM header decides the reported content type.
        single_header_cases = (
            ("\\fromhtml1", 'html'),
            ("\\fromtext", 'text'),
        )
        for from_header, expected_type in single_header_cases:
            rtf = self.replace_from_header(template_path, from_header)
            deencapsulator = DeEncapsulator(rtf)
            deencapsulator.deencapsulate()
            self.assertEqual(expected_type,
                             deencapsulator.get_content_type())

        # Two FROM headers in a row are malformed, whichever comes first.
        doubled_headers = ("\\fromtext\\fromhtml1", "\\fromhtml1\\fromtext")
        for doubled_header in doubled_headers:
            rtf = self.replace_from_header(template_path, doubled_header)
            self.check_deencapsulate_validity(
                rtf,
                expect_error=MalformedEncapsulatedRtf,
                name="multiple FROM headers means malformed")
Beispiel #2
0
 def _extract_msg_objects(self, msg_obj: MsgObj):
     """Extracts email objects needed to construct an eml from a msg.

     Args:
         msg_obj: The opened msg object to read headers, bodies and
             attachments from.

     Returns:
         A ``(message, body, attachments)`` tuple where

         - ``message`` is the header stream ``__substg1.0_007D`` parsed
           with ``policy.default``,
         - ``body`` maps ``'text'`` / ``'html'`` / ``'rtf'`` to dicts with
           the keys ``obj``, ``subtype``, ``charset`` and ``cte``; only
           the body kinds that are available are present,
         - ``attachments`` is ``msg_obj.attachments``.

     Side effects:
         Sets ``self.encapsulated_body`` to ``'text/html'`` or
         ``'text/plain'`` when that body was recovered from the RTF body
         because the msg had no native one.
     """
     original_eml_header = msg_obj._getStringStream('__substg1.0_007D')
     # NOTE(review): guard in case the header stream is missing and the
     # lookup hands back None; an empty header block is parsed instead of
     # raising TypeError. Confirm against _getStringStream's contract.
     message = email.message_from_string(original_eml_header or "",
                                         policy=policy.default)
     body = {}
     if msg_obj.body is not None:
         body['text'] = {
             "obj": msg_obj.body,
             "subtype": 'plain',
             "charset": "utf-8",
             "cte": "base64"
         }
     if msg_obj.htmlBody is not None:
         try:
             # Property 3FDE0003 carries a code page number (presumably
             # the HTML body's code page); map it to a codec name.
             _html_encoding_raw = msg_obj.mainProperties['3FDE0003'].value
             _html_encoding = codepage2codec(_html_encoding_raw)
         except KeyError:
             _html_encoding = msg_obj.stringEncoding
         try:
             # Decode with the codec the message declares so the text
             # agrees with the charset advertised below.
             html_text = msg_obj.htmlBody.decode(_html_encoding)
         except (LookupError, TypeError, UnicodeDecodeError):
             # Unknown/unset codec or bytes that do not match it: keep the
             # previous behaviour and decode with the default (UTF-8).
             html_text = msg_obj.htmlBody.decode()
         body['html'] = {
             'obj': html_text,
             "subtype": 'html',
             "charset": _html_encoding,
             "cte": "base64"
         }
     if msg_obj.rtfBody is not None:
         body['rtf'] = {
             "obj": msg_obj.rtfBody.decode(),
             "subtype": 'rtf',
             "charset": 'ascii',
             "cte": "base64"
         }
         try:
             rtf_obj = DeEncapsulator(msg_obj.rtfBody)
             rtf_obj.deencapsulate()
             # Only fall back to the encapsulated content when the msg has
             # no native body of the same kind.
             if (rtf_obj.content_type
                     == "html") and (msg_obj.htmlBody is None):
                 self.encapsulated_body = 'text/html'
                 body['html'] = {
                     "obj": rtf_obj.html,
                     "subtype": 'html',
                     "charset": rtf_obj.text_codec,
                     "cte": "base64"
                 }
             elif (rtf_obj.content_type
                   == "text") and (msg_obj.body is None):
                 self.encapsulated_body = 'text/plain'
                 body['text'] = {
                     "obj": rtf_obj.plain_text,
                     "subtype": 'plain',
                     "charset": rtf_obj.text_codec,
                     # Same keys as every other body entry.
                     "cte": "base64"
                 }
         except NotEncapsulatedRtf:
             logger.debug("RTF body in Msg object is not encapsualted.")
         except MalformedEncapsulatedRtf:
             logger.info(
                 "RTF body in Msg object contains encapsulated content, but it is malformed and can't be converted."
             )
     attachments = msg_obj.attachments
     return message, body, attachments
Beispiel #3
0
def main():
    """Command-line entry point.

    Opens the msg file named on the command line, logs how many
    attachments were found, and then either dumps the raw RTF body (to the
    requested output file, or to stdout when none is given) or
    de-encapsulates it and prints the recovered HTML or text.
    """
    cli = parse_arguments()
    set_logging(cli.verbose, cli.debug)
    with extract_msg.openMsg(cli.msg_path) as msg:
        try:
            found = get_attachments(msg)
        except KeyError:
            found = None
            log.debug("Msg does not have attachments embedded. Likely you used a low quality eml -> msg converter for testing and it provided somewhat broken msg files. Or at least that's when this pops off the most for me.")
        if found is not None:
            log.debug("{0} attachments found in msg.".format(len(found)))
        else:
            log.debug("No attachments found in msg.")

        rtf_bytes = msg.rtfBody
        if not cli.extract_raw:
            # Default mode: recover the original body from the RTF wrapper.
            decapsulated = DeEncapsulator(rtf_bytes.decode())
            decapsulated.deencapsulate()
            if decapsulated.content_type == 'html':
                print(decapsulated.html)
            else:
                print(decapsulated.text)
        elif cli.outfile:
            # Raw mode with a destination: write the RTF bytes untouched.
            with open(cli.outfile, 'wb') as sink:
                sink.write(rtf_bytes)
        else:
            print(rtf_bytes.decode())
    def check_deencapsulate_validity(self,
                                     data,
                                     expect_error=None,
                                     name="test"):
        """Helper to check if a test input raises or doesn't raise an error.

        Args:
            data: Input handed to ``DeEncapsulator``.
            expect_error: Exception class (or tuple of classes) that
                de-encapsulation must raise, or ``None`` when it must
                finish without raising.
            name: Label for the case; it appears in failure messages.

        Fails the test (via ``self.fail``) when the outcome does not match
        ``expect_error``.
        """
        found_error = None
        try:
            output = DeEncapsulator(data)
            output.deencapsulate()
        except Exception as _e:
            # Deliberately broad: any exception is captured so it can be
            # compared against the expectation below.
            found_error = _e

        if expect_error is None:
            if found_error is not None:
                self.fail(
                    'Unexpected error {!r} from DeEncapsulator for {}, '
                    'expected no error.'.format(found_error, name))
            return

        if found_error is None:
            self.fail(
                "Expected {} but DeEncapsulator finished without error on {}."
                .format(expect_error, name))
        if not isinstance(found_error, expect_error):
            # Report both the actual and the expected error so a mismatch
            # can be diagnosed from the message alone.
            self.fail(
                'Wrong kind of error {!r} from DeEncapsulator for {}, '
                'expected {}.'.format(found_error, name, expect_error))
Beispiel #5
0
 def test_japanese_encoded_text(self):
     """De-encapsulating the Japanese ISO-2022 fixture yields the original text."""
     fixture_path = join(DATA_BASE_DIR, "plain_text", "japanese_iso_2022.rtf")
     with open(fixture_path, 'r') as handle:
         rtf_obj = DeEncapsulator(handle.read())
     rtf_obj.deencapsulate()
     # Newlines are normalised before comparing against the known body.
     recovered = self.clean_newlines(rtf_obj.text)
     expected_body = "すみません。"
     self.assertEqual(recovered, expected_body)
Beispiel #6
0
 def test_u_encoded_html(self):
     """De-encapsulating the multiple-encodings HTML fixture matches its reference text."""
     rtf_path = join(DATA_BASE_DIR, "html", "multiple-encodings.rtf")
     txt_path = join(DATA_BASE_DIR, "html", "multiple-encodings.txt")

     # Reference content, whitespace-normalised.
     with open(txt_path, 'r') as reference_file:
         expected_html = self.clean_whitespace(reference_file.read())

     # De-encapsulated content, normalised the same way.
     with open(rtf_path, 'r') as rtf_file:
         rtf_obj = DeEncapsulator(rtf_file.read())
     rtf_obj.deencapsulate()
     actual_html = self.clean_whitespace(rtf_obj.html)

     self.compare_html(expected_html, actual_html)
Beispiel #7
0
    def test_quoted_printable(self):
        """Check that quoted-printable text does not survive de-encapsulation as-is.

        The de-encapsulated output is compared with the still-quoted
        reference text and the two are required to DIFFER, i.e. this test
        pins the "preserving quoted-printable is NOT IMPLEMENTED" state.
        If that feature is ever implemented this test will start failing
        and must be changed.
        """
        fixture_dir = ("plain_text", )
        rtf_fixture = join(DATA_BASE_DIR, *fixture_dir, "quoted_printable_01.rtf")
        txt_fixture = join(DATA_BASE_DIR, *fixture_dir, "quoted_printable_01.txt")

        with open(txt_fixture, 'r') as reference_file:
            quoted_reference = self.clean_newlines(reference_file.read())

        with open(rtf_fixture, 'r') as rtf_file:
            rtf_obj = DeEncapsulator(rtf_file.read())
        rtf_obj.deencapsulate()
        recovered = self.clean_newlines(rtf_obj.text)

        self.assertNotEqual(quoted_reference, recovered)
Beispiel #8
0
 def test_decoded_quoted_printable(self):
     """Check that de-encapsulated text equals the decoded quoted-printable original.

     The reference .txt is quoted-printable decoded and then decoded from
     cp1251 before being compared with the de-encapsulated RTF text.
     """
     rtf_fixture = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.rtf")
     txt_fixture = join(DATA_BASE_DIR, "plain_text", "quoted_printable_01.txt")
     charset = "cp1251"

     with open(txt_fixture, 'r') as reference_file:
         unquoted_bytes = quopri.decodestring(reference_file.read())
     expected_text = self.clean_newlines(unquoted_bytes.decode(charset))

     with open(rtf_fixture, 'r') as rtf_file:
         rtf_obj = DeEncapsulator(rtf_file.read())
     rtf_obj.deencapsulate()
     recovered = self.clean_newlines(rtf_obj.text)

     self.assertEqual(expected_text, recovered)
Beispiel #9
0
 def run_parsing(self, rtf):
     """Drive DeEncapsulator's private parsing steps by hand.

     Runs the strip, simplify and parse steps in that order, storing each
     result on the instance, and returns the DeEncapsulator so tests can
     inspect the intermediate state.
     """
     deencapsulator = DeEncapsulator(rtf)
     # Each step's result is stored on the instance before the next runs.
     stripped = deencapsulator._strip_htmlrtf_sections()
     deencapsulator.stripped_rtf = stripped
     simplified = deencapsulator._simplify_text_for_parsing()
     deencapsulator.simplified_rtf = simplified
     tree = deencapsulator._parse_rtf()
     deencapsulator.doc_tree = tree
     return deencapsulator