def main():
    """Command-line entry point: dump or de-encapsulate a .msg file's RTF body.

    Parses the CLI arguments, configures logging, opens the message found at
    ``args.msg_path`` and then either

    * emits the raw RTF body (``args.extract_raw``): written as bytes to
      ``args.outfile`` when given, otherwise decoded and printed, or
    * de-encapsulates the RTF body and prints the recovered HTML when the
      content type is ``'html'``, or the recovered text otherwise.

    Raises:
        SystemExit: when the message has no RTF body, so there is nothing
            to extract or de-encapsulate.
    """
    args = parse_arguments()
    set_logging(args.verbose, args.debug)
    msg_path = args.msg_path
    with extract_msg.openMsg(msg_path) as msg:
        # Attachments are only inspected for debug logging; a KeyError while
        # collecting them is tolerated on purpose.
        attachments = None
        try:
            attachments = get_attachments(msg)
        except KeyError:
            log.debug(
                "Msg does not have attachments embedded. Likely you used a low "
                "quality eml -> msg converter for testing and it provided "
                "somewhat broken msg files. Or at least that's when this pops "
                "off the most for me."
            )
        if attachments is None:
            log.debug("No attachments found in msg.")
        else:
            log.debug("{0} attachments found in msg.".format(len(attachments)))

        raw_rtf = msg.rtfBody
        if raw_rtf is None:
            # Without this guard the code below fails with an opaque
            # AttributeError/TypeError on ``None``. Raising inside the ``with``
            # block still closes the message.
            raise SystemExit(
                "No RTF body found in msg: {0}".format(msg_path)
            )

        if args.extract_raw:
            if args.outfile:
                with open(args.outfile, 'wb') as fp:
                    fp.write(raw_rtf)
            else:
                print(raw_rtf.decode())
        else:
            rtf_obj = DeEncapsulator(raw_rtf.decode())
            rtf_obj.deencapsulate()
            if rtf_obj.content_type == 'html':
                print(rtf_obj.html)
            else:
                print(rtf_obj.text)
def _extract_msg_objects(self, msg_obj: MsgObj):
    """Collect the pieces of a msg that are needed to rebuild it as an eml.

    Returns:
        A ``(message, body, attachments)`` tuple where

        * ``message`` is parsed (``policy.default``) from the
          ``'__substg1.0_007D'`` string stream of ``msg_obj``,
        * ``body`` maps ``'text'`` / ``'html'`` / ``'rtf'`` to a dict
          describing that part (``obj``, ``subtype``, ``charset`` and, for
          most parts, ``cte``),
        * ``attachments`` is ``msg_obj.attachments``.

    Side effects:
        Sets ``self.encapsulated_body`` to ``'text/html'`` or
        ``'text/plain'`` when that body was recovered from encapsulated RTF
        because the msg had no native body of that kind.
    """
    header_text = msg_obj._getStringStream('__substg1.0_007D')
    message = email.message_from_string(header_text, policy=policy.default)

    body = {}

    if msg_obj.body is not None:
        body['text'] = dict(
            obj=msg_obj.body,
            subtype='plain',
            charset="utf-8",
            cte="base64",
        )

    if msg_obj.htmlBody is not None:
        # Prefer the code page stored in property 3FDE0003; fall back to the
        # message-wide string encoding when that property is missing.
        try:
            html_charset = codepage2codec(
                msg_obj.mainProperties['3FDE0003'].value
            )
        except KeyError:
            html_charset = msg_obj.stringEncoding
        # NOTE(review): the bytes are decoded with the default codec, not
        # with html_charset -- confirm this is intended.
        body['html'] = dict(
            obj=msg_obj.htmlBody.decode(),
            subtype='html',
            charset=html_charset,
            cte="base64",
        )

    if msg_obj.rtfBody is not None:
        body['rtf'] = dict(
            obj=msg_obj.rtfBody.decode(),
            subtype='rtf',
            charset='ascii',
            cte="base64",
        )
        # The RTF may wrap an HTML or plain-text body; use it only to fill in
        # a body kind the msg does not already provide natively.
        try:
            rtf_obj = DeEncapsulator(msg_obj.rtfBody)
            rtf_obj.deencapsulate()
            if rtf_obj.content_type == "html" and msg_obj.htmlBody is None:
                self.encapsulated_body = 'text/html'
                body['html'] = dict(
                    obj=rtf_obj.html,
                    subtype='html',
                    charset=rtf_obj.text_codec,
                    cte="base64",
                )
            elif rtf_obj.content_type == "text" and msg_obj.body is None:
                self.encapsulated_body = 'text/plain'
                # NOTE(review): unlike every other part this entry has no
                # 'cte' key, and it reads `plain_text` -- confirm both.
                body['text'] = dict(
                    obj=rtf_obj.plain_text,
                    subtype='plain',
                    charset=rtf_obj.text_codec,
                )
        except NotEncapsulatedRtf:
            logger.debug("RTF body in Msg object is not encapsualted.")
        except MalformedEncapsulatedRtf:
            logger.info(
                "RTF body in Msg object contains encapsulated content, "
                "but it is malformed and can't be converted."
            )

    attachments = msg_obj.attachments
    return message, body, attachments
def check_deencapsulate_validity(self, data, expect_error=None, name="test"): """Helper to check if a test input raises or doesn't raise an error.""" found_error = None try: output = DeEncapsulator(data) output.deencapsulate() except Exception as _e: found_error = _e if expect_error is not None: if found_error is None: self.fail( "Expected {} but DeEncapsulator finished without error on {}." .format(expect_error, name)) if not isinstance(found_error, expect_error): self.fail( 'Unexpected error {} from DeEncapsulator for {}.'.format( found_error, name)) else: if found_error is not None: self.fail( 'Wrong kind of error {} from DeEncapsulator for {}, expected {}.' .format(type(found_error), name, expect_error))
def test_extracted_correct_from_header(self):
    """Check that the FROM header control word determines the content type.

    - a single fromhtml1 header is reported as 'html'
    - a single fromtext header is reported as 'text'
    - two FROM headers back to back, in either order, are rejected as
      malformed encapsulated RTF
    """
    template_data = join(DATA_BASE_DIR, "rtf_parsing",
                         "from_header_template.rtf")

    # One FROM header: the content type must follow it.
    single_headers = (
        ("\\fromhtml1", 'html'),
        ("\\fromtext", 'text'),
    )
    for header, expected_type in single_headers:
        rtf = self.replace_from_header(template_data, header)
        output = DeEncapsulator(rtf)
        output.deencapsulate()
        self.assertEqual(expected_type, output.get_content_type())

    # Two FROM headers back to back: neither wins, both orders must raise.
    for doubled_header in ("\\fromtext\\fromhtml1", "\\fromhtml1\\fromtext"):
        rtf = self.replace_from_header(template_data, doubled_header)
        self.check_deencapsulate_validity(
            rtf,
            expect_error=MalformedEncapsulatedRtf,
            name="multiple FROM headers means malformed")
def test_japanese_encoded_text(self):
    """De-encapsulating the japanese_iso_2022.rtf fixture yields the
    expected Japanese text (compared after newline clean-up)."""
    original_body = "すみません。"
    rtf_path = join(DATA_BASE_DIR, "plain_text", "japanese_iso_2022.rtf")

    with open(rtf_path, 'r') as rtf_file:
        raw_rtf = rtf_file.read()

    rtf_obj = DeEncapsulator(raw_rtf)
    rtf_obj.deencapsulate()

    self.assertEqual(self.clean_newlines(rtf_obj.text), original_body)
def test_u_encoded_html(self):
    """Tests that de-encapsulation of u-encoded HTML works.

    The HTML recovered from multiple-encodings.rtf is compared, after
    whitespace clean-up, against the reference multiple-encodings.txt.
    """
    html_dir = join(DATA_BASE_DIR, "html")
    rtf_path = join(html_dir, "multiple-encodings.rtf")
    txt_path = join(html_dir, "multiple-encodings.txt")

    # Reference text first, normalised the same way as the output below.
    with open(txt_path, 'r') as txt_file:
        raw_text = txt_file.read()
    original_text = self.clean_whitespace(raw_text)

    with open(rtf_path, 'r') as rtf_file:
        raw_rtf = rtf_file.read()
    rtf_obj = DeEncapsulator(raw_rtf)
    rtf_obj.deencapsulate()
    output_text = self.clean_whitespace(rtf_obj.html)

    self.compare_html(original_text, output_text)
def test_quoted_printable(self):
    """Pin the current behaviour for quoted-printable source messages.

    Ideally text that was quoted-printable in the original message would
    still be quoted after de-encapsulation. That is STILL NOT IMPLEMENTED,
    so this test asserts that the output does NOT equal the reference text.
    If the feature gets implemented this test will start failing and must be
    changed.
    """
    fixture_dir = join(DATA_BASE_DIR, "plain_text")
    quote_printable_rtf_path = join(fixture_dir, "quoted_printable_01.rtf")
    quote_printable_txt_path = join(fixture_dir, "quoted_printable_01.txt")
    # NOTE(review): paths for quoted_printable_01.eml / .msg were stubbed out
    # in comments here and are unused; presumably those fixtures exist.

    with open(quote_printable_txt_path, 'r') as txt_file:
        raw_text = txt_file.read()
    original_decoded_text = self.clean_newlines(raw_text)

    with open(quote_printable_rtf_path, 'r') as rtf_file:
        raw_rtf = rtf_file.read()
    rtf_obj = DeEncapsulator(raw_rtf)
    rtf_obj.deencapsulate()
    output_text = self.clean_newlines(rtf_obj.text)

    self.assertNotEqual(original_decoded_text, output_text)
def test_decoded_quoted_printable(self):
    """De-encapsulated text matches the quoted-printable reference once the
    reference has been decoded.

    The reference .txt is quoted-printable decoded, interpreted as cp1251
    and newline-normalised before being compared with the de-encapsulated
    text of the matching .rtf fixture.
    """
    charset = "cp1251"
    fixture_dir = join(DATA_BASE_DIR, "plain_text")
    quote_printable_rtf_path = join(fixture_dir, "quoted_printable_01.rtf")
    quote_printable_txt_path = join(fixture_dir, "quoted_printable_01.txt")
    # NOTE(review): paths for quoted_printable_01.eml / .msg were stubbed out
    # in comments here and are unused; presumably those fixtures exist.

    with open(quote_printable_txt_path, 'r') as txt_file:
        raw_text = txt_file.read()
    # quoted-printable -> bytes -> str in the expected charset -> normalised
    decoded_reference = quopri.decodestring(raw_text).decode(charset)
    original_decoded_text = self.clean_newlines(decoded_reference)

    with open(quote_printable_rtf_path, 'r') as rtf_file:
        raw_rtf = rtf_file.read()
    rtf_obj = DeEncapsulator(raw_rtf)
    rtf_obj.deencapsulate()
    output_text = self.clean_newlines(rtf_obj.text)

    self.assertEqual(original_decoded_text, output_text)
def run_parsing(self, rtf):
    """Drive the early DeEncapsulator stages by hand and return the object.

    Runs, in order, the htmlrtf-section stripping, the text simplification
    and the RTF parse, storing each result on the matching attribute
    (``stripped_rtf``, ``simplified_rtf``, ``doc_tree``).
    """
    deencapsulator = DeEncapsulator(rtf)
    # Order matters: each stage is invoked only after the previous stage's
    # result has been stored on the object.
    deencapsulator.stripped_rtf = deencapsulator._strip_htmlrtf_sections()
    deencapsulator.simplified_rtf = deencapsulator._simplify_text_for_parsing()
    deencapsulator.doc_tree = deencapsulator._parse_rtf()
    return deencapsulator