def test_policy_unknown(self):
    """An invalid unknown-member policy name must end in a ValueError.

    A scratch copy of the ``embedded.docx`` fixture is parsed, then a
    policy built from a bogus name is assigned to the parser; the
    build-and-assign statement is expected to raise ``ValueError``.
    """
    scratch_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/embedded.docx', scratch_path)
    parser = office.MSOfficeParser(scratch_path)

    with self.assertRaises(ValueError):
        parser.unknown_member_policy = UnknownMemberPolicy(
            'unknown_policy_name_totally_invalid')

    # Drop the scratch copy so it does not leak into other tests.
    os.remove(scratch_path)
def test_office(self):
    """Cleaning a docx must strip every ``w:rsid`` occurrence from its XML.

    The ``office_revision_session_ids.docx`` fixture is copied to a scratch
    file. Its ``.xml`` members are expected to contain 11 case-insensitive
    occurrences of ``w:rsid`` in total. After ``remove_all()`` succeeds,
    every ``.xml`` member of the cleaned output must contain none.
    """
    shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
    p = office.MSOfficeParser('./tests/data/clean.docx')

    meta = p.get_meta()
    self.assertIsNotNone(meta)

    # Integer total (sum() starts from 0) rather than a bool-seeded counter,
    # so a failure message always reports an actual count.
    with zipfile.ZipFile('./tests/data/clean.docx') as zin:
        how_many_rsid = sum(
            zin.read(item).decode('utf-8').lower().count('w:rsid')
            for item in zin.infolist()
            if item.filename.endswith('.xml')
        )
    self.assertEqual(how_many_rsid, 11)

    ret = p.remove_all()
    self.assertTrue(ret)

    with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
        for item in zin.infolist():
            if not item.filename.endswith('.xml'):
                continue
            # Checked per member so the first offending file fails the test.
            num = zin.read(item).decode('utf-8').lower().count('w:rsid')
            self.assertEqual(num, 0)

    os.remove('./tests/data/clean.docx')
    os.remove('./tests/data/clean.cleaned.docx')
def test_office_incomplete(self):
    """A docx with malformed content types parses, but cleaning fails.

    The parser object must be created for the scratch copy of
    ``malformed_content_types.docx``, and ``remove_all()`` must return a
    falsy value.
    """
    scratch_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/malformed_content_types.docx', scratch_path)

    parser = office.MSOfficeParser(scratch_path)
    self.assertIsNotNone(parser)

    cleaned = parser.remove_all()
    self.assertFalse(cleaned)

    os.remove(scratch_path)
def test_policy_keep(self):
    """With the KEEP policy, cleaning ``embedded.docx`` must succeed.

    Works on a scratch copy of the fixture and removes both the copy and
    the cleaned output afterwards.
    """
    scratch_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/embedded.docx', scratch_path)

    parser = office.MSOfficeParser(scratch_path)
    parser.unknown_member_policy = UnknownMemberPolicy.KEEP
    succeeded = parser.remove_all()
    self.assertTrue(succeeded)

    # Scratch input first, then the cleaned output written next to it.
    for leftover in (scratch_path, './tests/data/clean.cleaned.docx'):
        os.remove(leftover)
def test_policy_keep(self):
    """With the KEEP policy, cleaning a copy of ``embedded.docx`` must succeed.

    The copy is written to ``self.target``; afterwards the files named by
    the parser's ``filename`` and ``output_filename`` attributes are
    deleted.
    """
    shutil.copy('./tests/data/embedded.docx', self.target)
    parser = office.MSOfficeParser(self.target)
    parser.unknown_member_policy = UnknownMemberPolicy.KEEP

    succeeded = parser.remove_all()
    self.assertTrue(succeeded)

    os.remove(parser.filename)
    os.remove(parser.output_filename)
def test_office(self):
    """Clean ``dirty.docx``, then check the result is empty and cleanable again.

    After the first ``remove_all()`` the cleaned file must report an empty
    metadata dict, and cleaning that cleaned file must succeed too.
    """
    scratch_path = './tests/data/clean.docx'
    cleaned_path = './tests/data/clean.cleaned.docx'
    shutil.copy('./tests/data/dirty.docx', scratch_path)

    dirty_parser = office.MSOfficeParser(scratch_path)
    metadata = dirty_parser.get_meta()
    self.assertIsNotNone(metadata)

    first_pass = dirty_parser.remove_all()
    self.assertTrue(first_pass)

    cleaned_parser = office.MSOfficeParser(cleaned_path)
    self.assertEqual(cleaned_parser.get_meta(), {})
    self.assertTrue(cleaned_parser.remove_all())

    # The second pass writes yet another ".cleaned" file next to the first.
    for leftover in (
        scratch_path,
        cleaned_path,
        './tests/data/clean.cleaned.cleaned.docx',
    ):
        os.remove(leftover)
def test_complex_pptx(self):
    """Cleaning a copy of ``narrated_powerpoint_presentation.pptx`` must succeed."""
    target = './tests/data/clean.pptx'
    shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)

    parser = office.MSOfficeParser(target)
    succeeded = parser.remove_all()
    self.assertTrue(succeeded)

    # Remove the scratch copy, then whatever output file the parser names.
    os.remove(target)
    os.remove(parser.output_filename)
def test_office(self):
    """Clean ``dirty.docx`` and verify that no metadata survives.

    Before cleaning, the embedded ``word/media/image1.png`` member must
    expose the expected ``Comment``. After cleaning, the output must report
    an empty metadata dict and is then passed to the class's private
    ``__check_zip_meta`` and ``__check_deep_meta`` helpers.
    """
    scratch_path = './tests/data/clean.docx'
    cleaned_path = './tests/data/clean.cleaned.docx'
    shutil.copy('./tests/data/dirty.docx', scratch_path)

    dirty_parser = office.MSOfficeParser(scratch_path)
    metadata = dirty_parser.get_meta()
    self.assertIsNotNone(metadata)
    image_comment = metadata['word/media/image1.png']['Comment']
    self.assertEqual(image_comment, 'This is a comment, be careful!')

    succeeded = dirty_parser.remove_all()
    self.assertTrue(succeeded)

    cleaned_parser = office.MSOfficeParser(cleaned_path)
    self.assertEqual(cleaned_parser.get_meta(), {})
    self.__check_zip_meta(cleaned_parser)
    self.__check_deep_meta(cleaned_parser)

    for leftover in (scratch_path, cleaned_path):
        os.remove(leftover)
def test_docx(self):
    """``get_meta()`` on ``dirty.docx`` must report the expected author/app fields."""
    parser = office.MSOfficeParser('./tests/data/dirty.docx')
    metadata = parser.get_meta()

    # (archive member, metadata key, expected value), checked in this order.
    expectations = (
        ('docProps/core.xml', 'cp:lastModifiedBy', 'Julien Voisin'),
        ('docProps/core.xml', 'dc:creator', 'julien voisin'),
        (
            'docProps/app.xml',
            'Application',
            'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1',
        ),
    )
    for member, key, expected in expectations:
        self.assertEqual(metadata[member][key], expected)
def test_msoffice(self):
    """Cleaning ``revision.docx`` must remove its ``<w:ins ...>`` revision tag.

    The tag (carrying an author and a date) must be present in
    ``word/document.xml`` of the fixture and absent from the same member of
    the cleaned output.
    """
    revision_tag = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">'

    # ZipFile.read() returns the member's bytes without leaving an open
    # member handle behind, so each archive is fully closed on leaving the
    # ``with`` block and, in particular, before the files are deleted below.
    with zipfile.ZipFile('./tests/data/revision.docx') as zipin:
        content = zipin.read('word/document.xml')
    self.assertIn(revision_tag, content)

    shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx')
    p = office.MSOfficeParser('./tests/data/revision_clean.docx')
    self.assertTrue(p.remove_all())

    with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin:
        content = zipin.read('word/document.xml')
    self.assertNotIn(revision_tag, content)

    os.remove('./tests/data/revision_clean.docx')
    os.remove('./tests/data/revision_clean.cleaned.docx')
def test_docx_with_py(self):
    """Without an explicit policy, cleaning ``embedded.docx`` must report failure.

    ``remove_all()`` is expected to return a falsy value for the scratch
    copy of the fixture.
    """
    scratch_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/embedded.docx', scratch_path)

    parser = office.MSOfficeParser(scratch_path)
    cleaned = parser.remove_all()
    self.assertFalse(cleaned)

    os.remove(scratch_path)
def test_office_broken(self):
    """Constructing a parser for ``broken_xml_content_types.docx`` must raise ValueError."""
    scratch_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/broken_xml_content_types.docx', scratch_path)

    # Callable form of assertRaises: the constructor call itself must raise.
    self.assertRaises(ValueError, office.MSOfficeParser, scratch_path)

    os.remove(scratch_path)
def test_docx(self):
    """A non-docx file given a ``.docx`` name must be rejected with ValueError.

    The ``dirty.png`` fixture is copied to ``clean.docx`` and the parser
    constructor is expected to raise.
    """
    disguised_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/dirty.png', disguised_path)

    self.assertRaises(ValueError, office.MSOfficeParser, disguised_path)

    os.remove(disguised_path)
def test_office_incomplete(self):
    """Constructing a parser for ``malformed_content_types.docx`` must raise ValueError."""
    scratch_path = './tests/data/clean.docx'
    shutil.copy('./tests/data/malformed_content_types.docx', scratch_path)

    # The constructor itself is expected to reject the file.
    self.assertRaises(ValueError, office.MSOfficeParser, scratch_path)

    os.remove(scratch_path)