Python Document.Documentの例、officedissector.doc.Document.Document Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_officedissector.py プロジェクト: randubin/officedissector

 def testAssertions(self):
     """Check that constructing a Document from each broken fixture raises.

     Every ``Document(...)`` call below is wrapped in an ``assertRaises``
     block naming the exception that the fixture is expected to trigger.

     NOTE(review): each ``assertEqual`` on ``self.test_stdout`` is placed
     after the raising call inside the same ``with`` body. When the
     expected exception is raised the rest of that body is skipped, so the
     message checks never run on a passing test. Moving them after the
     ``with`` blocks would make them effective -- confirm first what
     ``test_stdout`` captures (it is set up outside this method).
     """
     # TQA-02.4
     with self.assertRaises(KeyError):
         Document('testdocs/bad_extension.doc')
         # Not reached when Document raises (see the note in the docstring).
         self.assertEqual(self.test_stdout.getvalue(),
                          'File extension is not an OOXML file type')
     # Skip this test: The document doesn't follow the spec, but is still openable
     # with self.assertRaisesRegexp(AssertionError,
     #                             'content_type of Part is empty'):
     #    Document('testdocs/missing_content_type.docx')
     with self.assertRaises(KeyError):
         Document('testdocs/missing_part.docx')
         # Not reached when Document raises.
         self.assertEqual(
             self.test_stdout.getvalue(),
             'target_path is not a valid Part: /word/endnotes.xml')
     with self.assertRaises(KeyError):
         Document('testdocs/missing_rel_target.docx')
         # Not reached when Document raises.
         self.assertEqual(self.test_stdout.getvalue(),
                          'target_path is not a valid Part: /')
     with self.assertRaises(etree.XMLSyntaxError):
         Document('testdocs/corrupt_xml.docx')
         # Not reached when Document raises.
         self.assertEqual(
             self.test_stdout.getvalue(),
             'part cannot be parsed successfully: Part [/[Content_Types].xml]'
         )

コード例 #2

0

ファイルを表示

 def testFileName(self):
     """Check the ``type`` reported by Document for each fixture file.

     ``assertEqual`` is used instead of ``assertTrue(a == b)`` so that a
     failure shows the value that was actually reported.
     """
     expected_types = (
         ('testdocs/test.docx', 'Word'),
         ('testdocs/test.xlsx', 'Excel'),
         ('testdocs/test.pptx', 'PowerPoint'),
     )
     for path, expected in expected_types:
         self.assertEqual(Document(path).type, expected)

コード例 #3

0

ファイルを表示

    def testRelationships(self):
        """Exercise relationship lookups on the Document and on its Parts."""
        doc1 = Document('testdocs/test.docx')
        self.assertEqual(len(doc1.relationships), 13)

        office_doc_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument'
        first_office_rel = doc1.relationships_dict[office_doc_type][0]
        self.assertEqual(first_office_rel.target, 'word/document.xml')

        def core_props_rel():
            # Looked up afresh for every assertion below.
            return doc1.find_relationships_by_type(
                'metadata/core-properties')[0]

        self.assertEqual(core_props_rel().source.name, 'RootPart')
        self.assertEqual(core_props_rel().source.content_type(),
                         '(virtual root part)')
        self.assertEqual(core_props_rel().id, 'rId2')
        self.assertEqual(core_props_rel().target, 'docProps/core.xml')
        self.assertEqual(core_props_rel().target_part.name,
                         '/docProps/core.xml')
        self.assertEqual(
            core_props_rel().type,
            'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties'
        )
        self.assertEqual(core_props_rel().is_external, False)

        # A hyperlink relationship is expected to be flagged as external.
        doc2 = Document('testdocs/216688.docx')
        first_hyperlink = doc2.find_relationships_by_type('hyperlink')[0]
        self.assertEqual(first_hyperlink.is_external, True)

        # Relationships seen from individual Parts.
        main_rels_out = doc1.part_by_name[
            '/word/document.xml'].relationships_out()
        self.assertEqual(main_rels_out[2].id, 'rId7')
        app_rels_out = doc1.part_by_name[
            '/docProps/app.xml'].relationships_out()
        self.assertEqual(app_rels_out, [])
        app_rels_in = doc1.part_by_name['/docProps/app.xml'].relationships_in()
        self.assertEqual(app_rels_in[0].id, 'rId3')

        root_rels_out = doc1.root_part.relationships_out()
        self.assertEqual(root_rels_out[1].id, 'rId2')

        # Both fragments of the relationship type select the same Part.
        by_short_type = doc1.parts_by_relationship_type('extended-properties')
        self.assertEqual(by_short_type[0].name, '/docProps/app.xml')
        by_longer_type = doc1.parts_by_relationship_type(
            'ships/extended-properties')
        self.assertEqual(by_longer_type[0].name, '/docProps/app.xml')

        main_part = doc1.main_part()
        self.assertEqual(main_part.name, '/word/document.xml')

コード例 #4

0

ファイルを表示

 def testBugs(self):
     """Regression test for BUG OXPA-83.

     A Relationship of Type 'image' with Target_Part='NULL' must be handled
     properly: reading the name of every image feature must not raise.
     """
     pptx = Document('testdocs/037027.pptx')
     for img in pptx.features.images:
         # Only the attribute access matters; the value itself is unused.
         _ = img.name

コード例 #5

0

ファイルを表示

    def testFeatures(self):
        """Check the feature collections exposed by ``Document.features``."""

        def part_name(part):
            # Sort key: order Parts by their name.
            return part.name

        doc1 = Document('testdocs/content.docx')
        self.assertEqual(doc1.features.custom_properties, [])
        self.assertEqual(len(doc1.features.images), 14)
        image1_types = [
            img.content_type() for img in doc1.features.images
            if img.name == '/word/media/image1.png'
        ]
        self.assertEqual(image1_types, ['image/png'])
        first_image = sorted(doc1.features.images, key=part_name)[0]
        self.assertEqual(first_image.content_type(), 'image/png')
        self.assertEqual(len(doc1.features.videos), 0)
        self.assertEqual(len(doc1.features.fonts), 2)
        first_font = sorted(doc1.features.fonts, key=part_name)[0]
        self.assertEqual(first_font.name, '/word/fonts/font1.odttf')

        doc2 = Document('testdocs/sounds.pptx')
        first_sound = sorted(doc2.features.sounds, key=part_name)[0]
        self.assertEqual(first_sound.name, '/ppt/media/audio1.wav')
        first_sound_again = sorted(doc2.features.sounds, key=part_name)[0]
        self.assertEqual(first_sound_again.content_type(), 'audio/wav')

        doc3 = Document('testdocs/macros.xlsm')
        first_macro = sorted(doc3.features.macros, key=part_name)[0]
        self.assertEqual(first_macro.name, '/xl/vbaProject.bin')
        first_control = sorted(doc3.features.embedded_controls,
                               key=part_name)[0]
        self.assertEqual(first_control.name, '/xl/activeX/activeX1.xml')

        doc4 = Document('testdocs/content2.docx')
        third_package = sorted(doc4.features.embedded_packages,
                               key=part_name)[2]
        self.assertEqual(
            third_package.name,
            '/word/embeddings/Microsoft_Excel-Arbeitsblatt3.xlsx')

        self.assertEqual(len(doc1.features.embedded_objects), 10)
        third_object = sorted(doc1.features.embedded_objects,
                              key=part_name)[2]
        self.assertEqual(
            third_object.name,
            '/word/embeddings/Microsoft_Office_PowerPoint_97-2003_Presentation7.ppt'
        )

コード例 #6

0

ファイルを表示

 def testPartCollection(self):
     """Check access to Parts by position and by name.

     Uses ``assertEqual`` rather than the deprecated ``assertEquals`` alias,
     which was removed from ``unittest`` in Python 3.12.
     """
     doc1 = Document('testdocs/test.docx')
     self.assertEqual(doc1.parts[0].name, '/[Content_Types].xml')
     # The expected bytes spell b'<?xml vers'.
     self.assertEqual(doc1.parts[0].stream().read(10),
                      b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')
     self.assertEqual(doc1.parts[2].name, '/word/_rels/document.xml.rels')
     self.assertEqual(doc1.part_by_name['/[Content_Types].xml'].name,
                      '/[Content_Types].xml')
     self.assertEqual(
         doc1.part_by_name['/[Content_Types].xml'].stream().read(10),
         b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')

コード例 #7

0

ファイルを表示

    def testContentTypes(self):
        """Check Part content types and lookups of Parts by content type.

        Uses ``assertEqual`` rather than the deprecated ``assertEquals``
        alias, which was removed from ``unittest`` in Python 3.12.
        """
        doc1 = Document('testdocs/test.docx')
        part1 = doc1.part_by_name['/word/document.xml']
        self.assertEqual(
            part1.content_type(),
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml'
        )
        # part2 is deliberately taken from a second Document built from the
        # same file, as in the original test.
        part2 = Document(
            'testdocs/test.docx').part_by_name['/customXml/item1.xml']
        self.assertEqual(part2.content_type(), 'application/xml')

        self.assertEqual(
            doc1.parts_by_content_type(
                'application/vnd.ms-word.stylesWithEffects+xml')[0].name,
            '/word/stylesWithEffects.xml')
        self.assertEqual(
            doc1.parts_by_content_type_regex('footnotes')[0].name,
            '/word/footnotes.xml')
        self.assertEqual(
            doc1.parts_by_content_type_regex('properties')[1].name,
            '/docProps/app.xml')

コード例 #8

0

ファイルを表示

    def testZipfileProperties(self):
        """Check the zip-level view of a Document.

        Covers the name list, archive and member comments, member sizes and
        timestamp, and the ``ZipCRCError`` expected for ``badcrc.docx``.

        The expected timestamp is written with plain decimal literals:
        ``07`` and ``03`` are a SyntaxError on Python 3. ``assertEqual``
        replaces the deprecated ``assertEquals`` alias, which was removed
        from ``unittest`` in Python 3.12.
        """
        doc1 = Document('testdocs/test.docx')
        self.assertEqual(doc1.zip().namelist()[0], '[Content_Types].xml')
        self.assertEqual(doc1.zip().comment, '')
        # The expected bytes spell b'<?xml vers'.
        self.assertEqual(
            doc1.zip().part_extract('[Content_Types].xml').read(10),
            b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')
        self.assertEqual(len(doc1.zip().namelist()), 17)

        doc2 = Document('testdocs/testzipattrib.docx')
        self.assertEqual(
            doc2.zip().part_info('[Content_Types].xml').file_size, 1818)
        self.assertEqual(
            doc2.zip().part_info('[Content_Types].xml').compress_size, 406)
        self.assertEqual(
            doc2.zip().part_info('[Content_Types].xml').date_time,
            (2013, 7, 3, 15, 22, 12))
        self.assertEqual(doc2.zip().part_info('[Content_Types].xml').comment,
                         '')

        with self.assertRaises(ZipCRCError):
            Document('testdocs/badcrc.docx').zip()

コード例 #9

0

ファイルを表示

 def testCoreProperties(self):
     """Check every core property of test.docx, then a file without any."""
     doc1 = Document('testdocs/test.docx')
     # (attribute name, expected value), checked in this order.
     expected_values = (
         ('name', '/docProps/core.xml'),
         ('category', 'Auxiliary'),
         ('content_status', ''),
         ('created', '2010-10-21T08:54:00Z'),
         ('creator', 'Klaus-Peter Eckert'),
         ('description', 'Footnotes and endnotes in different sections'),
         ('identifier', ''),
         ('keywords', 'rainbow, color, colour, couleur'),
         ('language', ''),
         ('last_modified_by', 'Klaus-Peter Eckert'),
         ('last_printed', ''),
         ('modified', '2010-10-21T09:05:00Z'),
         ('revision', '4'),
         ('subject', ''),
         ('title', ''),
         ('version', ''),
     )
     for attribute, expected in expected_values:
         self.assertEqual(getattr(doc1.core_properties, attribute),
                          expected)

     # For this fixture the core properties name is expected to be empty.
     doc2 = Document('testdocs/no_core_props.docx')
     self.assertEqual(doc2.core_properties.name, '')

コード例 #10

0

ファイルを表示

 def testExportJSON(self):
     """Check the reference strings and JSON output of Parts, Relationships
     and the Document itself."""
     doc1 = Document('testdocs/test.docx')

     part_reference = doc1.part_by_name['/word/document.xml'].to_reference()
     self.assertEqual(part_reference, 'Part [/word/document.xml]')

     third_rel_out = doc1.part_by_name[
         '/word/document.xml'].relationships_out()[2]
     self.assertEqual(
         third_rel_out.to_reference(),
         'Relationship [rId7] (source Part [/word/document.xml])')

     # Only the leading characters of each JSON export are compared.
     part_json_head = doc1.part_by_name['/word/document.xml'].to_json()[:30]
     self.assertEqual(part_json_head, '{\n    "content-type": "applica')

     rel_json_head = doc1.relationships[0].to_json()[:32]
     self.assertEqual(rel_json_head, '{\n    "source": "Part [RootPart]')

     doc_json_head = doc1.to_json()[:20]
     self.assertEqual(doc_json_head, '{\n    "document": [\n')

     stream_excerpt = doc1.to_json(include_stream=True)[285:325]
     self.assertEqual(stream_excerpt,
                      '     "stream_b64": "PD94bWwgdmVyc2lvbj0i')

コード例 #11

0

ファイルを表示

    def testPartXML(self):
        """Check XML parsing and XPath queries on Parts of several fixtures.

        Uses ``assertEqual`` rather than the deprecated ``assertEquals``
        alias, which was removed from ``unittest`` in Python 3.12.
        """
        part1 = Document(
            'testdocs/test.docx').part_by_name['/word/document.xml']
        self.assertEqual(
            list(part1.xml().getroot().iterchildren())[0].tag,
            '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}body'
        )
        self.assertEqual(
            part1.xpath('//@w:val',
                        part1.xml().getroot().nsmap)[2], 'Funotenzeichen')
        part2 = Document(
            'testdocs/test.docx').part_by_name['/[Content_Types].xml']
        xmlns = {
            'ct':
            'http://schemas.openxmlformats.org/package/2006/content-types'
        }
        self.assertEqual(
            part2.xpath('/ct:Types/ct:Override/@PartName', xmlns)[0],
            '/word/document.xml')

        # The same namespace-agnostic query is run against two further
        # fixtures (testutf16.docx and testascii.docx).
        part3 = Document(
            'testdocs/testutf16.docx').part_by_name['/word/document.xml']
        self.assertEqual(
            part3.xml().xpath(
                '//*[local-name() = "lang"]/@*[local-name() = "val"]')[0],
            'en-US')

        part4 = Document(
            'testdocs/testascii.docx').part_by_name['/word/document.xml']
        self.assertEqual(
            part4.xml().xpath(
                '//*[local-name() = "lang"]/@*[local-name() = "val"]')[0],
            'en-US')

        doc5 = Document('testdocs/macros-non-standard.xlsm')
        self.assertEqual(doc5.features.macros[0].name, '/xl/new_name.bin')

        part6 = Document('testdocs/non-standard-namespace.docx'
                         ).part_by_name['/word/document.xml']
        # NOTE(review): the query runs on part1 while only the namespace map
        # comes from part6 -- confirm whether part6.xpath(...) was intended.
        self.assertEqual(
            part1.xpath('//@fake:val',
                        part6.xml().getroot().nsmap)[2], 'Funotenzeichen')

コード例 #12

0

ファイルを表示

ファイル: test_officedissector.py プロジェクト: randubin/officedissector

 def testDenialOfService(self):
     """Construct a Document from testdocs/dos.docx; it only has to return."""
     # The instance is not inspected, only built.
     _ = Document('testdocs/dos.docx')

コード例 #13

0

ファイルを表示

ファイル: test_officedissector.py プロジェクト: randubin/officedissector

 def testPseudoFile(self):
     """Build a Document from an in-memory copy of a file instead of a path."""
     with open("testdocs/macros.xlsm", 'rb') as handle:
         raw_bytes = handle.read()
         in_memory = BytesIO(raw_bytes)
         # The filename is passed separately alongside the pseudofile.
         Document(pseudofile=in_memory, filename='macros.xlsm')

コード例 #14

0

ファイルを表示

 def testPart(self):
     """Check a Part constructed directly from a Document and a part name.

     Uses ``assertEqual`` rather than the deprecated ``assertEquals`` alias,
     which was removed from ``unittest`` in Python 3.12.
     """
     part1 = Part(Document('testdocs/test.docx'), '/[Content_Types].xml')
     self.assertEqual(part1.name, '/[Content_Types].xml')
     # The expected bytes spell b'<?xml vers'.
     self.assertEqual(part1.stream().read(10),
                      b'\x3C\x3F\x78\x6D\x6C\x20\x76\x65\x72\x73')

コード例 #15

0

ファイルを表示

    def testDocOpen(self):
        """Smoke-test Document against every file in the local corpora.

        Every regular file directly inside the ``govdocs`` and
        ``fraunhoferlibrary`` directories next to this module is opened and
        exercised. Progress is printed and written to ``smoke_tests.log``;
        failures to open a file are also written to
        ``smoke_tests_error.log``.

        Both logs are opened as context managers so they are closed even
        when one of the checks raises part-way through the corpus.
        """
        cur_dir = os.path.dirname(os.path.realpath(__file__))
        corpus_path = [
            os.path.join(cur_dir, 'govdocs'),
            os.path.join(cur_dir, 'fraunhoferlibrary')
        ]
        files = []
        for dir_ in corpus_path:
            for f in os.listdir(dir_):
                file_ = os.path.join(dir_, f)
                if os.path.isfile(file_):
                    files.append(file_)

        with codecs.open('smoke_tests.log',
                         'w',
                         encoding='utf-8',
                         errors="surrogateescape") as log:
            with codecs.open('smoke_tests_error.log',
                             'w',
                             encoding='utf-8',
                             errors="surrogateescape") as errorlog:
                # Write error log header
                errorlog.write(
                    'Three files from govdocs (govdocs/641559.docx, govdocs/500968.xlsx, and\n'
                    'govdocs/974690.xlsx) are apparently corrupt and do not open in Microsoft\n'
                    'Office 2010; hence, error messages should appear for them. (Note that govdocs\n'
                    'is a random sample and explicitly includes bad or corrupt files.)\n'
                    'No other error messages should appear here.\n\n\n')

                for docfile in files:
                    self._checkCorpusDocument(docfile, log, errorlog)

    def _checkCorpusDocument(self, docfile, log, errorlog):
        """Open one corpus file and exercise the Document API on it.

        Args:
            docfile: path of the file to open.
            log: open text stream receiving the progress log.
            errorlog: open text stream receiving open failures only.

        A file that cannot be opened is reported to both logs and skipped;
        an exception raised by any later check propagates to the caller.
        """
        if (sys.version_info < (3, 0)):
            # On Python 2 the path is decoded so it can be formatted into
            # text for the logs.
            to_print = docfile.decode('utf8', 'replace')
        else:
            to_print = docfile
        print('\nTesting %s...' % to_print)
        log.write('\nTesting %s...\n' % to_print)
        try:
            doc1 = Document(docfile)
        except ZipCRCError:
            msg = 'Error: Bad CRC for file: %s\n' % to_print
            print(msg)
            log.write(msg)
            errorlog.write(msg)
            return
        except Exception as e:
            # Deliberately broad: any failure to open is logged and the
            # file is skipped.
            msg = 'Error: File: %s: %s - %s\n' % (
                to_print, sys.exc_info()[0].__name__, e)
            print(msg)
            log.write(msg)
            errorlog.write(msg)
            return

        log.write('  Document type is: %s\n' % doc1.type)
        log.write('  Document is macro enabled: %s\n' %
                  doc1.is_macro_enabled)
        log.write('  Document is a template: %s\n' % doc1.is_template)

        print('  Testing zip.part_info method...')
        log.write('  Testing zip.part_info method...\n')
        log.write(
            '    zip.part_info([Content_Types].xml).file_size: %s\n' %
            doc1.zip().part_info('[Content_Types].xml').file_size)
        log.write(
            '    zip.part_info([Content_Types].xml).compress_size: %s\n' %
            doc1.zip().part_info('[Content_Types].xml').compress_size)
        print('  Done.')
        log.write('  Done.\n')

        second_part = doc1.parts[1]
        print('  Testing Part: %s' % second_part.name)
        log.write('  Testing Part: %s\n' % second_part.name)

        # Only success matters for the next two calls; results are unused.
        doc1.part_by_name[second_part.name].stream().read(10)
        print('  Part stream successfully captured.')
        log.write('  Part stream successfully captured.\n')
        doc1.part_by_name[second_part.name].xml()
        print('  Part XML successfully parsed.')
        log.write('  Part XML successfully parsed.\n')

        print('  Checking doc.xpath method...')
        log.write('  Checking doc.xpath method...\n')
        log.write('    XPath Result: %s\n' %
                  doc1.part_by_name['/[Content_Types].xml'].xpath(
                      '*/@ContentType')[0])
        print('  Done.')
        log.write('  Done.\n')

        print('  Checking that all Parts can get Content_Type...')
        log.write('  Checking that all Parts can get Content_Type...\n')
        for part in doc1.parts:
            ct = part.content_type()
            log.write('    Part %s is Content_Type: %s\n' %
                      (part.name, ct))
        print('  Done.')
        log.write('  Done.\n')

        print('  Checking that Document has main_part...')
        log.write('  Checking that Document has main_part...\n')
        doc_main = doc1.main_part()
        log.write('    Main Part: %s\n' % doc_main.name)
        print('  Done.')
        log.write('  Done.\n')

        print(
            '  Checking all source and target Relationships for each part...'
        )
        log.write(
            '  Checking all source and target Relationships for each part...\n'
        )
        for part in doc1.parts:
            rel_in = part.relationships_in()
            rel_out = part.relationships_out()
            log.write('    Part %s: Relationships in source name: %s\n' %
                      (part.name, [r.source.name for r in rel_in]))
            log.write('    Part %s: Relationships out: %s\n' %
                      (part.name, [r.target for r in rel_out]))
        print('  Done.')
        log.write('  Done.\n')

        print(
            '  Testing Document methods to find by Part or Relationship...'
        )
        log.write(
            '  Testing Document methods to find by Part or Relationship...\n'
        )
        log.write('    doc.parts_by_content_type(application/xml): %s\n' %
                  doc1.parts_by_content_type('application/xml')[0])
        log.write('    doc.parts_by_content_type_regex(ation/xm): %s\n' %
                  doc1.parts_by_content_type_regex('ation/xm')[0])
        log.write(
            '    doc.parts_by_relationship_type(/relationships/officeDocument: %s\n'
            % doc1.parts_by_relationship_type(
                '/relationships/officeDocument')[0].name)
        log.write(
            '    doc.find_relationship_by_type(/relationships/officeDocument).source: %s\n'
            % doc1.find_relationships_by_type(
                '/relationships/officeDocument')[0].source)
        print('  Done.')
        log.write('  Done.\n')

        print('  Checking for all Features...')
        log.write('  Checking for Features...\n')
        for image in doc1.features.images:
            log.write('    Image: %s\n' % image.name)
        for video in doc1.features.videos:
            log.write('    Video: %s\n' % video.name)
        for sound in doc1.features.sounds:
            log.write('    Sound: %s\n' % sound.name)
        for font in doc1.features.fonts:
            log.write('    Font: %s\n' % font.name)
        for macro in doc1.features.macros:
            log.write('    Macro content: %s\n' % macro.name)
        for comment in doc1.features.comments:
            log.write('    Comment content: %s\n' % comment.name)
        for customX in doc1.features.custom_xml:
            log.write('    Custom XML content: %s\n' % customX.name)
        for embedded_control in doc1.features.embedded_controls:
            log.write('    Embedded Control content: %s\n' %
                      embedded_control.name)
        for embedded_object in doc1.features.embedded_objects:
            log.write('    Embedded Object content: %s\n' %
                      embedded_object.name)
        for embedded_package in doc1.features.embedded_packages:
            log.write('    Embedded Package content: %s\n' %
                      embedded_package.name)
        for digital_signature in doc1.features.digital_signatures:
            log.write('    Digital Signature content: %s\n' %
                      digital_signature.name)
        print('  Done.')
        log.write('  Done.\n')

        print('  Checking Core Properties...')
        log.write('  Checking Core Properties...\n')
        log.write('    Category: %s\n' % doc1.core_properties.category)
        log.write('    Content status: %s\n' %
                  doc1.core_properties.content_status)
        log.write('    Created: %s\n' % doc1.core_properties.created)
        log.write('    Creator: %s\n' % doc1.core_properties.creator)
        log.write('    Description: %s\n' %
                  doc1.core_properties.description)
        log.write('    Identifier: %s\n' % doc1.core_properties.identifier)
        log.write('    Keywords: %s\n' % doc1.core_properties.keywords)
        log.write('    Language: %s\n' % doc1.core_properties.language)
        log.write('    Last modified by: %s\n' %
                  doc1.core_properties.last_modified_by)
        log.write('    Last printed: %s\n' %
                  doc1.core_properties.last_printed)
        log.write('    Modified: %s\n' % doc1.core_properties.modified)
        log.write('    Revision: %s\n' % doc1.core_properties.revision)
        log.write('    Subject: %s\n' % doc1.core_properties.subject)
        log.write('    Title: %s\n' % doc1.core_properties.title)
        log.write('    Version: %s\n' % doc1.core_properties.version)
        print('  Done.')
        log.write('  Done.\n')

        print('  Checking export to JSON...')
        log.write('  Checking export to JSON...\n')
        doc_json = doc1.to_json()
        log.write('    Beginning of JSON: %s\n' % doc_json[0:50])
        print('  Done.')
        log.write('  Done.\n')

        print('Done.')
        log.write('Done.\n')

コード例 #16

0

ファイルを表示

 def testIfFileExists(self):
     """Constructing a Document from 'fakefile.docx' must raise IOError."""
     self.assertRaises(IOError, Document, 'fakefile.docx')

コード例 #17

0

ファイルを表示

 def testFileTemplate(self):
     """Check ``is_template``: false for test.docx, true for test.dotx."""
     self.assertFalse(Document('testdocs/test.docx').is_template)
     self.assertTrue(Document('testdocs/test.dotx').is_template)

コード例 #18

0

ファイルを表示

 def testFileMacro(self):
     """Check ``is_macro_enabled``: false for test.docx, true for test.docm."""
     self.assertFalse(Document('testdocs/test.docx').is_macro_enabled)
     self.assertTrue(Document('testdocs/test.docm').is_macro_enabled)