def testAssertions(self):
    """Check that malformed input files make Document raise.

    Covers a non-OOXML extension, a missing part, a missing relationship
    target and a corrupt XML part.
    """
    # TQA-02.4
    # NOTE(review): each assertEqual below sits inside its `with` block, so
    # it is not reached once Document() raises as expected. Confirm whether
    # the message checks were meant to run after the block instead.
    with self.assertRaises(KeyError):
        Document('testdocs/bad_extension.doc')
        self.assertEqual(self.test_stdout.getvalue(),
                         'File extension is not an OOXML file type')
    # Skip this test: The document doesn't follow the spec, but is still openable
    # with self.assertRaisesRegexp(AssertionError,
    #                              'content_type of Part is empty'):
    #     Document('testdocs/missing_content_type.docx')
    with self.assertRaises(KeyError):
        Document('testdocs/missing_part.docx')
        self.assertEqual(
            self.test_stdout.getvalue(),
            'target_path is not a valid Part: /word/endnotes.xml')
    with self.assertRaises(KeyError):
        Document('testdocs/missing_rel_target.docx')
        self.assertEqual(self.test_stdout.getvalue(),
                         'target_path is not a valid Part: /')
    with self.assertRaises(etree.XMLSyntaxError):
        Document('testdocs/corrupt_xml.docx')
        self.assertEqual(
            self.test_stdout.getvalue(),
            'part cannot be parsed successfully: Part [/[Content_Types].xml]'
        )
def testFileName(self):
    """Check Document.type for a .docx, a .xlsx and a .pptx sample file."""
    # assertEqual (rather than assertTrue(a == b)) reports both values when
    # the comparison fails.
    doc1 = Document('testdocs/test.docx')
    self.assertEqual(doc1.type, 'Word')
    doc2 = Document('testdocs/test.xlsx')
    self.assertEqual(doc2.type, 'Excel')
    doc3 = Document('testdocs/test.pptx')
    self.assertEqual(doc3.type, 'PowerPoint')
def testRelationships(self):
    """Exercise relationship lookups on a Document, its Parts and its root part."""
    word_doc = Document('testdocs/test.docx')
    self.assertEqual(len(word_doc.relationships), 13)

    office_doc_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
    office_doc_rels = word_doc.relationships_dict[office_doc_type]
    self.assertEqual(office_doc_rels[0].target, 'word/document.xml')

    def core_rel():
        # Performs a fresh lookup on every call, one per assertion.
        return word_doc.find_relationships_by_type(
            'metadata/core-properties')[0]

    self.assertEqual(core_rel().source.name, 'RootPart')
    self.assertEqual(core_rel().source.content_type(),
                     '(virtual root part)')
    self.assertEqual(core_rel().id, 'rId2')
    self.assertEqual(core_rel().target, 'docProps/core.xml')
    self.assertEqual(core_rel().target_part.name, '/docProps/core.xml')
    self.assertEqual(
        core_rel().type,
        'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
    )
    self.assertEqual(core_rel().is_external, False)

    # A hyperlink relationship is expected to be flagged as external.
    linked_doc = Document('testdocs/216688.docx')
    first_hyperlink = linked_doc.find_relationships_by_type('hyperlink')[0]
    self.assertEqual(first_hyperlink.is_external, True)

    # Relationships seen from individual parts.
    outgoing = word_doc.part_by_name['/word/document.xml'].relationships_out()
    self.assertEqual(outgoing[2].id, 'rId7')
    self.assertEqual(
        word_doc.part_by_name['/docProps/app.xml'].relationships_out(), [])
    incoming = word_doc.part_by_name['/docProps/app.xml'].relationships_in()
    self.assertEqual(incoming[0].id, 'rId3')
    self.assertEqual(word_doc.root_part.relationships_out()[1].id, 'rId2')

    # Lookup of parts by the type of the relationship that targets them.
    for type_fragment in ('extended-properties',
                          'ships/extended-properties'):
        matches = word_doc.parts_by_relationship_type(type_fragment)
        self.assertEqual(matches[0].name, '/docProps/app.xml')
    self.assertEqual(word_doc.main_part().name, '/word/document.xml')
def testBugs(self):
    """Regression check: the name of every image feature can be read."""
    # Regression test for BUG OXPA-83
    # Make sure Target_Part='NULL', in this case a Relationship with
    # Type 'image', is handled properly
    presentation = Document('testdocs/037027.pptx')
    for picture in presentation.features.images:
        # Only the attribute access matters; the value is discarded.
        _ = picture.name
def testFeatures(self):
    """Check the feature collections reported for several sample documents."""

    def ordered(parts):
        # Sort by part name so that index-based checks are deterministic.
        return sorted(parts, key=lambda part: part.name)

    content_doc = Document('testdocs/content.docx')
    self.assertEqual(content_doc.features.custom_properties, [])
    self.assertEqual(len(content_doc.features.images), 14)
    image1_types = [
        img.content_type() for img in content_doc.features.images
        if img.name == '/word/media/image1.png'
    ]
    self.assertEqual(image1_types, ['image/png'])
    self.assertEqual(
        ordered(content_doc.features.images)[0].content_type(),
        'image/png')
    self.assertEqual(len(content_doc.features.videos), 0)
    self.assertEqual(len(content_doc.features.fonts), 2)
    self.assertEqual(
        ordered(content_doc.features.fonts)[0].name,
        '/word/fonts/font1.odttf')

    sound_doc = Document('testdocs/sounds.pptx')
    self.assertEqual(
        ordered(sound_doc.features.sounds)[0].name,
        '/ppt/media/audio1.wav')
    self.assertEqual(
        ordered(sound_doc.features.sounds)[0].content_type(),
        'audio/wav')

    macro_doc = Document('testdocs/macros.xlsm')
    self.assertEqual(
        ordered(macro_doc.features.macros)[0].name,
        '/xl/vbaProject.bin')
    self.assertEqual(
        ordered(macro_doc.features.embedded_controls)[0].name,
        '/xl/activeX/activeX1.xml')

    package_doc = Document('testdocs/content2.docx')
    self.assertEqual(
        ordered(package_doc.features.embedded_packages)[2].name,
        '/word/embeddings/Microsoft_Excel-Arbeitsblatt3.xlsx')

    self.assertEqual(len(content_doc.features.embedded_objects), 10)
    self.assertEqual(
        ordered(content_doc.features.embedded_objects)[2].name,
        '/word/embeddings/Microsoft_Office_PowerPoint_97-2003_Presentation7.ppt'
    )
def testPartCollection(self):
    """Check access to parts by position and by name, and their streams."""
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    doc1 = Document('testdocs/test.docx')
    self.assertEqual(doc1.parts[0].name, '/[Content_Types].xml')
    # The expected bytes spell out '<?xml vers'.
    self.assertEqual(doc1.parts[0].stream().read(10),
                     b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')
    self.assertEqual(doc1.parts[2].name, '/word/_rels/document.xml.rels')
    self.assertEqual(doc1.part_by_name['/[Content_Types].xml'].name,
                     '/[Content_Types].xml')
    self.assertEqual(
        doc1.part_by_name['/[Content_Types].xml'].stream().read(10),
        b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')
def testContentTypes(self):
    """Check content types of parts and lookup of parts by content type."""
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    doc1 = Document('testdocs/test.docx')
    part1 = doc1.part_by_name['/word/document.xml']
    self.assertEqual(
        part1.content_type(),
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml'
    )
    part2 = Document(
        'testdocs/test.docx').part_by_name['/customXml/item1.xml']
    self.assertEqual(part2.content_type(), 'application/xml')
    # Lookup by exact content type.
    self.assertEqual(
        doc1.parts_by_content_type(
            'application/vnd.ms-word.stylesWithEffects+xml')[0].name,
        '/word/stylesWithEffects.xml')
    # Lookup by a pattern matched against the content type.
    self.assertEqual(
        doc1.parts_by_content_type_regex('footnotes')[0].name,
        '/word/footnotes.xml')
    self.assertEqual(
        doc1.parts_by_content_type_regex('properties')[1].name,
        '/docProps/app.xml')
def testZipfileProperties(self):
    """Check the zip-level view of a Document: names, sizes, dates, CRC errors."""
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    doc1 = Document('testdocs/test.docx')
    self.assertEqual(doc1.zip().namelist()[0], '[Content_Types].xml')
    self.assertEqual(doc1.zip().comment, '')
    # The expected bytes spell out '<?xml vers'.
    self.assertEqual(
        doc1.zip().part_extract('[Content_Types].xml').read(10),
        b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')
    self.assertEqual(len(doc1.zip().namelist()), 17)
    doc2 = Document('testdocs/testzipattrib.docx')
    self.assertEqual(
        doc2.zip().part_info('[Content_Types].xml').file_size, 1818)
    self.assertEqual(
        doc2.zip().part_info('[Content_Types].xml').compress_size, 406)
    # Month and day are written without leading zeros: "07" and "03" are a
    # SyntaxError on Python 3 (and octal literals on Python 2).
    self.assertEqual(
        doc2.zip().part_info('[Content_Types].xml').date_time,
        (2013, 7, 3, 15, 22, 12))
    self.assertEqual(doc2.zip().part_info('[Content_Types].xml').comment,
                     '')
    with self.assertRaises(ZipCRCError):
        Document('testdocs/badcrc.docx').zip()
def testCoreProperties(self):
    """Check every core property of test.docx, and a file without core properties."""
    with_props = Document('testdocs/test.docx')
    self.assertEqual(with_props.core_properties.name, '/docProps/core.xml')

    # (attribute name, expected value), checked in this order.
    expected = (
        ('category', 'Auxiliary'),
        ('content_status', ''),
        ('created', '2010-10-21T08:54:00Z'),
        ('creator', 'Klaus-Peter Eckert'),
        ('description', 'Footnotes and endnotes in different sections'),
        ('identifier', ''),
        ('keywords', 'rainbow, color, colour, couleur'),
        ('language', ''),
        ('last_modified_by', 'Klaus-Peter Eckert'),
        ('last_printed', ''),
        ('modified', '2010-10-21T09:05:00Z'),
        ('revision', '4'),
        ('subject', ''),
        ('title', ''),
        ('version', ''),
    )
    for attribute, value in expected:
        self.assertEqual(
            getattr(with_props.core_properties, attribute), value)

    without_props = Document('testdocs/no_core_props.docx')
    self.assertEqual(without_props.core_properties.name, '')
def testExportJSON(self):
    """Check to_reference() and the start of to_json() for parts, relationships and documents."""
    doc1 = Document('testdocs/test.docx')
    self.assertEqual(
        doc1.part_by_name['/word/document.xml'].to_reference(),
        'Part [/word/document.xml]')
    self.assertEqual(
        doc1.part_by_name['/word/document.xml'].relationships_out()
        [2].to_reference(),
        'Relationship [rId7] (source Part [/word/document.xml])')
    # Each expected literal must be exactly as long as the slice it is
    # compared with (30, 32, 20 and 40 characters); otherwise the
    # comparison cannot succeed. The indentation inside the literals is
    # sized accordingly.
    # NOTE(review): this presumes the JSON is indented by 4 spaces --
    # confirm against to_json().
    self.assertEqual(
        doc1.part_by_name['/word/document.xml'].to_json()[0:30],
        '{\n    "content-type": "applica')
    self.assertEqual(doc1.relationships[0].to_json()[0:32],
                     '{\n    "source": "Part [RootPart]')
    self.assertEqual(doc1.to_json()[0:20], '{\n    "document": [\n')
    # The slice starts part-way through an indented line, hence the
    # 5 leading spaces.
    self.assertEqual(
        doc1.to_json(include_stream=True)[285:325],
        '     "stream_b64": "PD94bWwgdmVyc2lvbj0i')
def testPartXML(self):
    """Check XML parsing and XPath queries on parts of several sample documents."""
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    part1 = Document(
        'testdocs/test.docx').part_by_name['/word/document.xml']
    self.assertEqual(
        list(part1.xml().getroot().iterchildren())[0].tag,
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}body'
    )
    self.assertEqual(
        part1.xpath('//@w:val', part1.xml().getroot().nsmap)[2],
        'Funotenzeichen')
    # XPath with an explicitly supplied prefix-to-namespace mapping.
    part2 = Document(
        'testdocs/test.docx').part_by_name['/[Content_Types].xml']
    xmlns = {
        'ct': 'http://schemas.openxmlformats.org/package/2006/content-types'
    }
    self.assertEqual(
        part2.xpath('/ct:Types/ct:Override/@PartName', xmlns)[0],
        '/word/document.xml')
    # The same query against two other sample files (named testutf16 and
    # testascii).
    part3 = Document(
        'testdocs/testutf16.docx').part_by_name['/word/document.xml']
    self.assertEqual(
        part3.xml().xpath(
            '//*[local-name() = "lang"]/@*[local-name() = "val"]')[0],
        'en-US')
    part4 = Document(
        'testdocs/testascii.docx').part_by_name['/word/document.xml']
    self.assertEqual(
        part4.xml().xpath(
            '//*[local-name() = "lang"]/@*[local-name() = "val"]')[0],
        'en-US')
    doc5 = Document('testdocs/macros-non-standard.xlsm')
    self.assertEqual(doc5.features.macros[0].name, '/xl/new_name.bin')
    part6 = Document('testdocs/non-standard-namespace.docx'
                     ).part_by_name['/word/document.xml']
    # NOTE(review): the query runs on part1 but uses part6's namespace
    # map -- presumably deliberate (the 'fake' prefix resolving to the
    # same namespace); confirm.
    self.assertEqual(
        part1.xpath('//@fake:val', part6.xml().getroot().nsmap)[2],
        'Funotenzeichen')
def testDenialOfService(self):
    """Open testdocs/dos.docx; the test passes if construction completes."""
    # The instance itself is not inspected, so it is not bound to a name.
    Document('testdocs/dos.docx')
def testPseudoFile(self):
    """Build a Document from an in-memory file object instead of a path."""
    with open("testdocs/macros.xlsm", 'rb') as handle:
        raw_bytes = handle.read()
    in_memory = BytesIO(raw_bytes)
    Document(pseudofile=in_memory, filename='macros.xlsm')
def testPart(self):
    """Construct a Part directly and check its name and the start of its stream."""
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    part1 = Part(Document('testdocs/test.docx'), '/[Content_Types].xml')
    self.assertEqual(part1.name, '/[Content_Types].xml')
    # The expected bytes spell out '<?xml vers'.
    self.assertEqual(part1.stream().read(10),
                     b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')
def testDocOpen(self):
    """Smoke-test Document against every file in the local corpora.

    Every regular file in the 'govdocs' and 'fraunhoferlibrary'
    directories next to this test module is opened and run through the
    public Document/Part API. Progress is written to 'smoke_tests.log';
    files that fail to open are also recorded in 'smoke_tests_error.log'
    and then skipped.

    Only the opening of a document is guarded: an exception raised by any
    later check propagates and fails the test.
    """
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    corpus_path = [
        os.path.join(cur_dir, 'govdocs'),
        os.path.join(cur_dir, 'fraunhoferlibrary')
    ]
    files = []
    for dir_ in corpus_path:
        for f in os.listdir(dir_):
            file_ = os.path.join(dir_, f)
            if os.path.isfile(file_):
                files.append(file_)
    # Both logs are managed by the with statement so that they are closed
    # even when one of the unguarded checks below raises.
    with codecs.open('smoke_tests.log',
                     'w',
                     encoding='utf-8',
                     errors="surrogateescape") as log, \
            codecs.open('smoke_tests_error.log',
                        'w',
                        encoding='utf-8',
                        errors="surrogateescape") as errorlog:
        # Write error log header
        errorlog.write(
            'Three files from govdocs (govdocs/641559.docx, govdocs/500968.xlsx, and\n'
            'govdocs/974690.xlsx) are apparently corrupt and do not open in Microsoft\n'
            'Office 2010; hence, error messages should appear for them. (Note that govdocs\n'
            'is a random sample and explicitly includes bad or corrupt files.)\n'
            'No other error messages should appear here.\n\n\n')
        for docfile in files:
            # On Python 2 the path is a byte string; decode it for output.
            if (sys.version_info < (3, 0)):
                to_print = docfile.decode('utf8', 'replace')
            else:
                to_print = docfile
            print('\nTesting %s...' % to_print)
            log.write('\nTesting %s...\n' % to_print)
            try:
                doc1 = Document(docfile)
            except ZipCRCError:
                msg = 'Error: Bad CRC for file: %s\n' % to_print
                print(msg)
                log.write(msg)
                errorlog.write(msg)
                continue
            except Exception as e:
                # Deliberately broad: any failure to open is logged and
                # the file is skipped.
                msg = 'Error: File: %s: %s - %s\n' % (
                    to_print, sys.exc_info()[0].__name__, e)
                print(msg)
                log.write(msg)
                errorlog.write(msg)
                continue
            log.write(' Document type is: %s\n' % doc1.type)
            log.write(' Document is macro enabled: %s\n' %
                      doc1.is_macro_enabled)
            log.write(' Document is a template: %s\n' % doc1.is_template)

            print(' Testing zip.part_info method...')
            log.write(' Testing zip.part_info method...\n')
            log.write(
                ' zip.part_info([Content_Types].xml).file_size: %s\n' %
                doc1.zip().part_info('[Content_Types].xml').file_size)
            log.write(
                ' zip.part_info([Content_Types].xml).compress_size: %s\n' %
                doc1.zip().part_info('[Content_Types].xml').compress_size)
            print(' Done.')
            log.write(' Done.\n')

            second_part = doc1.parts[1]
            print(' Testing Part: %s' % second_part.name)
            log.write(' Testing Part: %s\n' % second_part.name)
            # The results are unused; the calls only have to succeed.
            doc_stream = doc1.part_by_name[second_part.name].stream().read(10)
            print(' Part stream successfully captured.')
            log.write(' Part stream successfully captured.\n')
            partxml = doc1.part_by_name[second_part.name].xml()
            print(' Part XML successfully parsed.')
            log.write(' Part XML successfully parsed.\n')

            print(' Checking doc.xpath method...')
            log.write(' Checking doc.xpath method...\n')
            log.write(' XPath Result: %s\n' %
                      doc1.part_by_name['/[Content_Types].xml'].xpath(
                          '*/@ContentType')[0])
            print(' Done.')
            log.write(' Done.\n')

            print(' Checking that all Parts can get Content_Type...')
            log.write(' Checking that all Parts can get Content_Type...\n')
            for part in doc1.parts:
                ct = part.content_type()
                log.write(' Part %s is Content_Type: %s\n' %
                          (part.name, ct))
            print(' Done.')
            log.write(' Done.\n')

            print(' Checking that Document has main_part...')
            log.write(' Checking that Document has main_part...\n')
            doc_main = doc1.main_part()
            log.write(' Main Part: %s\n' % doc_main.name)
            print(' Done.')
            log.write(' Done.\n')

            print(
                ' Checking all source and target Relationships for each part...'
            )
            log.write(
                ' Checking all source and target Relationships for each part...\n'
            )
            for part in doc1.parts:
                rel_in = part.relationships_in()
                rel_out = part.relationships_out()
                log.write(' Part %s: Relationships in source name: %s\n' %
                          (part.name, [r.source.name for r in rel_in]))
                log.write(' Part %s: Relationships out: %s\n' %
                          (part.name, [r.target for r in rel_out]))
            print(' Done.')
            log.write(' Done.\n')

            print(
                ' Testing Document methods to find by Part or Relationship...'
            )
            log.write(
                ' Testing Document methods to find by Part or Relationship...\n'
            )
            log.write(' doc.parts_by_content_type(application/xml): %s\n' %
                      doc1.parts_by_content_type('application/xml')[0])
            log.write(' doc.parts_by_content_type_regex(ation/xm): %s\n' %
                      doc1.parts_by_content_type_regex('ation/xm')[0])
            log.write(
                ' doc.parts_by_relationship_type(/relationships/officeDocument: %s\n'
                % doc1.parts_by_relationship_type(
                    '/relationships/officeDocument')[0].name)
            log.write(
                ' doc.find_relationship_by_type(/relationships/officeDocument).source: %s\n'
                % doc1.find_relationships_by_type(
                    '/relationships/officeDocument')[0].source)
            print(' Done.')
            log.write(' Done.\n')

            print(' Checking for all Features...')
            log.write(' Checking for Features...\n')
            for image in doc1.features.images:
                log.write(' Image: %s\n' % image.name)
            for video in doc1.features.videos:
                log.write(' Video: %s\n' % video.name)
            for sound in doc1.features.sounds:
                log.write(' Sound: %s\n' % sound.name)
            for font in doc1.features.fonts:
                log.write(' Font: %s\n' % font.name)
            for macro in doc1.features.macros:
                log.write(' Macro content: %s\n' % macro.name)
            for comment in doc1.features.comments:
                log.write(' Comment content: %s\n' % comment.name)
            for customX in doc1.features.custom_xml:
                log.write(' Custom XML content: %s\n' % customX.name)
            for embedded_control in doc1.features.embedded_controls:
                log.write(' Embedded Control content: %s\n' %
                          embedded_control.name)
            for embedded_object in doc1.features.embedded_objects:
                log.write(' Embedded Object content: %s\n' %
                          embedded_object.name)
            for embedded_package in doc1.features.embedded_packages:
                log.write(' Embedded Package content: %s\n' %
                          embedded_package.name)
            for digital_signature in doc1.features.digital_signatures:
                log.write(' Digital Signature content: %s\n' %
                          digital_signature.name)
            print(' Done.')
            log.write(' Done.\n')

            print(' Checking Core Properties...')
            log.write(' Checking Core Properties...\n')
            log.write(' Category: %s\n' % doc1.core_properties.category)
            log.write(' Content status: %s\n' %
                      doc1.core_properties.content_status)
            log.write(' Created: %s\n' % doc1.core_properties.created)
            log.write(' Creator: %s\n' % doc1.core_properties.creator)
            log.write(' Description: %s\n' %
                      doc1.core_properties.description)
            log.write(' Identifier: %s\n' %
                      doc1.core_properties.identifier)
            log.write(' Keywords: %s\n' % doc1.core_properties.keywords)
            log.write(' Language: %s\n' % doc1.core_properties.language)
            log.write(' Last modified by: %s\n' %
                      doc1.core_properties.last_modified_by)
            log.write(' Last printed: %s\n' %
                      doc1.core_properties.last_printed)
            log.write(' Modified: %s\n' % doc1.core_properties.modified)
            log.write(' Revision: %s\n' % doc1.core_properties.revision)
            log.write(' Subject: %s\n' % doc1.core_properties.subject)
            log.write(' Title: %s\n' % doc1.core_properties.title)
            log.write(' Version: %s\n' % doc1.core_properties.version)
            print(' Done.')
            log.write(' Done.\n')

            print(' Checking export to JSON...')
            log.write(' Checking export to JSON...\n')
            doc_json = doc1.to_json()
            log.write(' Beginning of JSON: %s\n' % doc_json[0:50])
            print(' Done.')
            log.write(' Done.\n')

            # Closes the per-file 'Testing ...' entry written above.
            print('Done.')
            log.write('Done.\n')
def testIfFileExists(self):
    """Opening a path that does not exist must raise IOError."""
    self.assertRaises(IOError, Document, 'fakefile.docx')
def testFileTemplate(self):
    """is_template is False for test.docx and True for test.dotx."""
    regular = Document('testdocs/test.docx')
    self.assertFalse(regular.is_template)
    template = Document('testdocs/test.dotx')
    self.assertTrue(template.is_template)
def testFileMacro(self):
    """is_macro_enabled is False for test.docx and True for test.docm."""
    plain = Document('testdocs/test.docx')
    self.assertFalse(plain.is_macro_enabled)
    with_macros = Document('testdocs/test.docm')
    self.assertTrue(with_macros.is_macro_enabled)